diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index e5becb5d..9a4b7d03 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -13,5 +13,5 @@ jobs: - uses: actions/checkout@v2 - uses: psf/black@stable with: - options: "--check --diff --color" - version: "21.10b0" \ No newline at end of file + options: "--check --diff --color --extend-exclude '/hyp_utils\\/kaldi/'" + version: "21.10b0" diff --git a/egs/chime5_spkdet/v1/local/make_musan.py b/egs/chime5_spkdet/v1/local/make_musan.py index 8a93a18b..b0ae6846 100755 --- a/egs/chime5_spkdet/v1/local/make_musan.py +++ b/egs/chime5_spkdet/v1/local/make_musan.py @@ -7,125 +7,183 @@ import os, sys + def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals def prepare_music(root_dir, fs, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", 
+ num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_speech(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_noise(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + 
utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def main(): - in_dir = sys.argv[1] - fs = int(sys.argv[2]) - out_dir = sys.argv[3] - use_vocals = sys.argv[4] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) -if __name__=="__main__": - main() +if __name__ == "__main__": + main() diff --git a/egs/chime5_spkdet/v1/local/score_dcf.py b/egs/chime5_spkdet/v1/local/score_dcf.py index 4026d7c9..1137e049 100755 --- a/egs/chime5_spkdet/v1/local/score_dcf.py +++ b/egs/chime5_spkdet/v1/local/score_dcf.py @@ -20,48 +20,54 @@ def score_dcf(key_file, score_file, output_path): - logging.info('Load key: %s' % key_file) + logging.info("Load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('Load scores: %s' % score_file) + logging.info("Load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) - priors = np.array([0.001, 0.005, 0.01, 0.05 ]) + priors = np.array([0.001, 0.005, 0.01, 0.05]) min_dcf, act_dcf, eer, _ = fast_eval(tar, non, priors) - + output_dir = os.path.dirname(output_path) if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_file = output_path + '_results' - with open(output_file, 'w') as f: - s = 'EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}'.format( - eer * 100, min_dcf[3], act_dcf[3], - min_dcf[2], act_dcf[2], - min_dcf[1], act_dcf[1], - min_dcf[0], act_dcf[0]) + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + ) f.write(s) logging.info(s) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Computes EER and DCF') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", dest="key_file", required=True) + 
parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - score_dcf(**vars(args)) - + score_dcf(**vars(args)) diff --git a/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py b/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py index 1c130a63..b77d3595 100755 --- a/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py +++ b/egs/chime5_spkdet/v1/steps_be/eval-be-diar-v2.py @@ -24,19 +24,27 @@ def combine_diar_scores(ndx, orig_seg, subseg_scores): scores = np.zeros(ndx.trial_mask.shape, dtype=float_cpu()) for j in range(len(ndx.seg_set)): - idx = orig_seg == j #ndx.seg_set[j] + idx = orig_seg == j # ndx.seg_set[j] subseg_scores_j = subseg_scores[:, idx] scores_j = np.max(subseg_scores_j, axis=1) - scores[:,j] = scores_j + scores[:, j] = scores_j return scores -def eval_plda(enroll_v_file, test_v_file, ndx_file, enroll_file, - preproc_file, model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + enroll_v_file, + test_v_file, + ndx_file, + enroll_file, + preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -45,53 +53,55 @@ def eval_plda(enroll_v_file, test_v_file, ndx_file, enroll_file, tdr = TDR(enroll_v_file, test_v_file, ndx_file, enroll_file, None, preproc) x_e, x_t, enroll, ndx, orig_seg = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - - logging.info('computing llr') + + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('combine cluster scores') + logging.info("combine cluster scores") scores = combine_diar_scores(ndx, orig_seg, scores) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s = s.align_with_ndx(ndx) s.save_txt(score_file) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA with diarization in test') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA with diarization in test", + ) - parser.add_argument('--enroll-v-file', required=True) - parser.add_argument('--test-v-file', required=True) - parser.add_argument('--ndx-file', required=True) - parser.add_argument('--enroll-file', required=True) - parser.add_argument('--preproc-file', default=None) + parser.add_argument("--enroll-v-file", required=True) + parser.add_argument("--test-v-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--preproc-file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py b/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py index f0b2ab9a..dc3e3f87 100755 --- a/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/eval-be-v1.py @@ -22,12 +22,19 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -36,50 +43,51 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc) x_e, x_t, enroll, ndx = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py b/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py index 0eaabd1f..fb5dd6f9 100755 --- a/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/eval-calibration-v1.py @@ -24,45 +24,45 @@ def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - - logging.info('load scores: %s' % in_score_file) + + logging.info("load scores: %s" % in_score_file) scr = TrialScores.load_txt(in_score_file) scr = scr.align_with_ndx(ndx) - logging.info('load model: %s' % model_file) + logging.info("load model: %s" % model_file) lr = LR.load(model_file) - logging.info('apply calibration') + logging.info("apply calibration") s_cal = lr.predict(scr.scores.ravel()) scr.scores = np.reshape(s_cal, scr.scores.shape) - logging.info('save scores: %s' % out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals linear calibration') + fromfile_prefix_chars="@", + description="Evals linear calibration", + ) - parser.add_argument('--in-score-file', dest='in_score_file', required=True) - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - 
parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_calibration(**vars(args)) - + eval_calibration(**vars(args)) diff --git a/egs/chime5_spkdet/v1/steps_be/train-be-v1.py b/egs/chime5_spkdet/v1/steps_be/train-be-v1.py index 1ee8b7ca..55c412ac 100755 --- a/egs/chime5_spkdet/v1/steps_be/train-be-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/train-be-v1.py @@ -21,86 +21,90 @@ from hyperion.helpers import PLDAFactory as F -def train_be(iv_file, train_list, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - output_path, **kwargs): +def train_be( + iv_file, + train_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + output_path, + **kwargs +): # Read data - logging.info('loading data') + logging.info("loading data") vcr_args = VCR.filter_args(**kwargs) vcr_train = VCR(iv_file, train_list, None, **vcr_args) x, class_ids = vcr_train.read() # Train LDA - logging.info('train LDA') + logging.info("train LDA") t1 = time.time() - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening - logging.info('train length norm') + logging.info("train length norm") t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('length norm elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("length norm elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA - logging.info('train PLDA') + logging.info("train PLDA") t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA elapsed time: %.2f s.' % (time.time()-t1)) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + logging.info("PLDA elapsed time: %.2f s." 
% (time.time() - t1)) # Save models - logging.info('saving models') + logging.info("saving models") preproc = TransformList(lda) preproc.append(lnorm) if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end') + fromfile_prefix_chars="@", + description="Train Back-end", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_be(**vars(args)) - + train_be(**vars(args)) diff --git a/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py b/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py index a26b310b..fa1dfcf7 100755 --- a/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py +++ b/egs/chime5_spkdet/v1/steps_be/train-calibration-v1.py @@ -23,63 +23,65 @@ def train_calibration(score_file, key_file, model_file, prior, verbose): - logging.info('load key: %s' % key_file) + logging.info("load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('load scores: %s' % score_file) + logging.info("load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) - logging.info('train calibration') + logging.info("train calibration") x = np.concatenate((tar, non)) - y = np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) lr = LR(prior=prior, verbose=verbose) lr.fit(x, y) print(lr.A) print(lr.b) - logging.info('save calibration at %s' % model_file) + logging.info("save calibration at %s" % model_file) lr.save(model_file) - logging.info('calibrate scores') + logging.info("calibrate scores") tar_cal = 
lr.predict(tar) non_cal = lr.predict(non) print(tar_cal) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - - - + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains llr calibration') - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Trains llr calibration", + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_calibration(**vars(args)) - + train_calibration(**vars(args)) diff --git a/egs/cifar/v1/local/resnet-cifar.py b/egs/cifar/v1/local/resnet-cifar.py index ebc86b49..6b052266 100755 --- a/egs/cifar/v1/local/resnet-cifar.py +++ b/egs/cifar/v1/local/resnet-cifar.py @@ -28,102 +28,142 @@ from hyperion.torch.trainers import TorchTrainer from hyperion.torch.metrics import CategoricalAccuracy -def main(batch_size, test_batch_size, exp_path, - epochs, num_gpus, log_interval, resume, cifar_vers, **kwargs): - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def main( + batch_size, + test_batch_size, + exp_path, + epochs, + num_gpus, + log_interval, + resume, + cifar_vers, + **kwargs +): + + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) - transform_train = transforms.Compose([ - transforms.RandomCrop(32, padding=4), - transforms.RandomHorizontalFlip(), - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), - ]) + transform_train = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) - transform_test = transforms.Compose([ - transforms.ToTensor(), - transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), - ]) + transform_test = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), + ] + ) - largs = {'num_workers': 2, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": 2, "pin_memory": True} if num_gpus > 0 else {} if cifar_vers == 10: Dataset = datasets.CIFAR10 else: Dataset = datasets.CIFAR100 - trainset = Dataset( - root='./data', train=True, download=True, 
transform=transform_train) + root="./data", train=True, download=True, transform=transform_train + ) train_loader = torch.utils.data.DataLoader( - trainset, batch_size=batch_size, shuffle=True, **largs) + trainset, batch_size=batch_size, shuffle=True, **largs + ) testset = Dataset( - root='./data', train=False, download=True, transform=transform_test) + root="./data", train=False, download=True, transform=transform_test + ) test_loader = torch.utils.data.DataLoader( - testset, batch_size=test_batch_size, shuffle=False, **largs) - + testset, batch_size=test_batch_size, shuffle=False, **largs + ) model_args = RNF.filter_args(**kwargs) - model_args['in_channels'] = 3 - model_args['out_units'] = cifar_vers - logging.info('model-args={}'.format(model_args)) + model_args["in_channels"] = 3 + model_args["out_units"] = cifar_vers + logging.info("model-args={}".format(model_args)) model = RNF.create(**model_args) - logging.info('model={}'.format(model)) + logging.info("model={}".format(model)) # classes cifar-10 # classes = ('plane', 'car', 'bird', 'cat', 'deer', # 'dog', 'frog', 'horse', 'ship', 'truck') - opt_args = OF.filter_args(prefix='opt', **kwargs) - logging.info('optim-args={}'.format(opt_args)) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) - logging.info('lr-sched-args={}'.format(lrsch_args)) + opt_args = OF.filter_args(prefix="opt", **kwargs) + logging.info("optim-args={}".format(opt_args)) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) + logging.info("lr-sched-args={}".format(lrsch_args)) optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) loss = nn.CrossEntropyLoss() - metrics = { 'acc': CategoricalAccuracy() } - - trainer = TorchTrainer(model, optimizer, loss, epochs, exp_path, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1)) + metrics = {"acc": CategoricalAccuracy()} + + trainer = TorchTrainer( + model, + optimizer, + loss, + epochs, + exp_path, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='PyTorch CIFAR') - - parser.add_argument('--batch-size', type=int, default=128, - help='input batch size for training') - parser.add_argument('--test-batch-size', type=int, default=100, - help='input batch size for testing') - parser.add_argument('--epochs', type=int, default=300, - help='number of epochs to train') + fromfile_prefix_chars="@", + description="PyTorch CIFAR", + ) + + parser.add_argument( + "--batch-size", type=int, default=128, help="input batch size for training" + ) + parser.add_argument( + "--test-batch-size", type=int, default=100, help="input batch size for testing" + ) + parser.add_argument( + "--epochs", type=int, default=300, help="number of epochs to train" + ) RNF.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') - - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1, - help='random seed') - parser.add_argument('--log-interval', type=int, default=10, - help='how many batches to wait before logging training status') - parser.add_argument('--resume', action='store_true', default=False, - 
help='resume training from checkpoint') - parser.add_argument('--exp-path', help='experiment path') - parser.add_argument('--cifar-vers', default=10, type=int, choices=[10, 100], help='CIFAR version') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") + + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1, help="random seed") + parser.add_argument( + "--log-interval", + type=int, + default=10, + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument("--exp-path", help="experiment path") + parser.add_argument( + "--cifar-vers", default=10, type=int, choices=[10, 100], help="CIFAR version" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -134,6 +174,3 @@ def main(batch_size, test_batch_size, exp_path, del args.seed main(**vars(args)) - - - diff --git a/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py b/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py index bc562fd3..25282718 100755 --- a/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py +++ b/egs/dihard2019/v1/steps_diar/eval-ahc-v1.py @@ -5,14 +5,20 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging from pathlib import Path import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.hyp_defs import float_cpu, config_logger @@ -25,17 +31,19 @@ from hyperion.clustering import AHC from hyperion.pdfs import GMMTiedDiagCov as GMM from hyperion.diarization import DiarAHCPLDA as Diar -#from hyperion.pdfs import GMMDiagCov as GMM2 -#from hyperion.pdfs import GMM as GMM3 + +# from hyperion.pdfs import GMMDiagCov as GMM2 +# from hyperion.pdfs import GMM as GMM3 + def make_timestamps(n, win_start, win_length, win_shift, win_shrink): - - t1 = np.asarray([win_start+win_shift*i for i in range(n)]) + + t1 = np.asarray([win_start + win_shift * i for i in range(n)]) t2 = t1 + win_length - win_shrink t1 += win_shrink - t1[t1<0]=0 - assert np.all(t2-t1>0) - timestamps = np.concatenate((t1[:,None],t2[:,None]), axis=1) + t1[t1 < 0] = 0 + assert np.all(t2 - t1 > 0) + timestamps = np.concatenate((t1[:, None], t2[:, None]), axis=1) return timestamps @@ -60,15 +68,15 @@ def load_test_list(test_list, part_idx, num_parts): return keys -def load_feats(key, r_x, r_time, r_vad, - win_start, win_length, win_shift, win_shrink): - +def load_feats(key, r_x, r_time, r_vad, win_start, win_length, win_shift, win_shrink): + x = r_x.read([key])[0] if r_time is not None: timestamps = r_time.load([key])[0] else: timestamps = make_timestamps( - len(x), win_start, win_length, win_shift, win_shrink) + len(x), win_start, win_length, win_shift, win_shrink + ) if r_vad is not None: vad_timestamps = r_vad.read_timestamps([key])[0] @@ -89,22 +97,27 @@ def plot_score_hist(scores, output_file, thr=None, gmm=None): mask = np.triu(np.ones(scores.shape, dtype=np.bool), 1) scores_r = scores[mask].ravel() - _, bins, _ = 
plt.hist(scores_r, 100, - histtype='step', density=True, color='b', - linestyle='solid', linewidth=1.5) + _, bins, _ = plt.hist( + scores_r, + 100, + histtype="step", + density=True, + color="b", + linestyle="solid", + linewidth=1.5, + ) if thr is not None: - plt.axvline(x=thr, color='k') + plt.axvline(x=thr, color="k") if gmm is not None: - prob = np.exp(gmm.log_prob(bins[:,None])) - plt.plot(bins, prob, color='r', - linestyle='solid', linewidth=1.5) - - #plt.title(name) - plt.xlabel('LLR score') + prob = np.exp(gmm.log_prob(bins[:, None])) + plt.plot(bins, prob, color="r", linestyle="solid", linewidth=1.5) + + # plt.title(name) + plt.xlabel("LLR score") plt.grid(True) - #plt.legend() + # plt.legend() plt.savefig(output_file) plt.clf() @@ -116,35 +129,44 @@ def twoGMMcalib_lin(s, niters=20): and array of linearly callibrated log odds ratio scores. """ from scipy.special import softmax + weights = np.array([0.5, 0.5]) means = np.mean(s) + np.std(s) * np.array([-1, 1]) var = np.var(s) for _ in range(niters): - lls = np.log(weights)-0.5*np.log(var) - 0.5*(s[:,np.newaxis]-means)**2/var + lls = ( + np.log(weights) + - 0.5 * np.log(var) + - 0.5 * (s[:, np.newaxis] - means) ** 2 / var + ) gammas = softmax(lls, axis=1) cnts = np.sum(gammas, axis=0) weights = cnts / cnts.sum() means = s.dot(gammas) / cnts - var = ((s**2).dot(gammas) / cnts - means**2).dot(weights) + var = ((s ** 2).dot(gammas) / cnts - means ** 2).dot(weights) - logging.info('niko {} {} {}'.format(means, var, weights)) - threshold = -0.5*(np.log(weights**2/var)-means**2/var).dot([1,-1])/(means/var).dot([1,-1]) - return threshold, lls[:,means.argmax()]-lls[:,means.argmin()] + logging.info("niko {} {} {}".format(means, var, weights)) + threshold = ( + -0.5 + * (np.log(weights ** 2 / var) - means ** 2 / var).dot([1, -1]) + / (means / var).dot([1, -1]) + ) + return threshold, lls[:, means.argmax()] - lls[:, means.argmin()] def unsup_gmm_calibration(scores): mask = np.triu(np.ones(scores.shape, dtype=np.bool), 1) - scores_r = scores[mask].ravel()[:,None] # N x 1 + scores_r = scores[mask].ravel()[:, None] # N x 1 gmm_1c = GMM(num_comp=1) gmm_1c.fit(scores_r, epochs=1) - #gmm_2c = GMM( - # mu=np.asarray([[np.max(scores_r), np.min(scores_r)]]).T, + # gmm_2c = GMM( + # mu=np.asarray([[np.max(scores_r), np.min(scores_r)]]).T, # Lambda=gmm_1c.Lambda, pi=np.asarray([0.5, 0.5])) gmm_2c = gmm_1c.split_comp(2) # logging.info('gmm1 {} {} {}'.format(gmm_2c.mu, gmm_2c.Lambda, gmm_2c.pi)) e = gmm_2c.fit(scores_r, epochs=20) - # logging.info('gmm2 {} {} {} {} {} {} {}'.format(gmm_2c.mu, gmm_2c.Lambda, gmm_2c.pi, e, - # np.mean(gmm_2c.log_prob(scores_r)), + # logging.info('gmm2 {} {} {} {} {} {} {}'.format(gmm_2c.mu, gmm_2c.Lambda, gmm_2c.pi, e, + # np.mean(gmm_2c.log_prob(scores_r)), # np.sum(gmm_2c.compute_pz_nat(scores_r), axis=0), # np.sum(gmm_2c.compute_pz_std(scores_r), axis=0))) # k1 = GMM2(num_comp=1) @@ -158,62 +180,77 @@ def unsup_gmm_calibration(scores): # k2 = k1.split_comp(2) # k2.fit(scores_r, epochs=20) # logging.info('k3 {} {} {}'.format(k2.mu, k2.Lambda, k2.pi)) - - + # e = gmm_2c.fit(scores_r, epochs=1) # print(gmm_2c.mu, gmm_2c.Lambda, gmm_2c.pi, e, np.mean(gmm_2c.log_prob(scores_r))) # e = gmm_2c.fit(scores_r, epochs=1) # print(gmm_2c.mu, gmm_2c.Lambda, gmm_2c.pi, e, np.mean(gmm_2c.log_prob(scores_r))) scale = (gmm_2c.mu[0] - gmm_2c.mu[1]) * gmm_2c.Lambda - bias = (gmm_2c.mu[1]**2 - gmm_2c.mu[0]**2) * gmm_2c.Lambda / 2 + np.log(gmm_2c.pi[0]) - np.log(gmm_2c.pi[1]) + bias = ( + (gmm_2c.mu[1] ** 2 - gmm_2c.mu[0] ** 2) * gmm_2c.Lambda 
/ 2 + + np.log(gmm_2c.pi[0]) + - np.log(gmm_2c.pi[1]) + ) scores = scale * scores + bias # scores1 = scale * scores_r + bias # scores2 = gmm_2c.compute_log_pz(scores_r) # scores2 = scores2[:,0] - scores2[:,1] - # t, scores_niko = twoGMMcalib_lin(scores_r.ravel(), niters=20) + # t, scores_niko = twoGMMcalib_lin(scores_r.ravel(), niters=20) # logging.info('scores={} {} {}'.format(scores1, scores2, scores_niko + np.log(gmm_2c.pi[0]) - np.log(gmm_2c.pi[1]))) bic_lambda = 1 n = len(scores_r) dparams = 4 - bic = np.mean(gmm_2c.log_prob(scores_r) - gmm_1c.log_prob(scores_r)) - bic_lambda * dparams * np.log(n)/2/n + bic = ( + np.mean(gmm_2c.log_prob(scores_r) - gmm_1c.log_prob(scores_r)) + - bic_lambda * dparams * np.log(n) / 2 / n + ) return scores, bic, gmm_2c - -def do_clustering(x, t_preproc, plda_model, threshold, pca_var_r, - do_unsup_cal, use_bic, - hist_file=None): +def do_clustering( + x, + t_preproc, + plda_model, + threshold, + pca_var_r, + do_unsup_cal, + use_bic, + hist_file=None, +): x = t_preproc.predict(x) if pca_var_r < 1: pca = PCA(pca_var_r=pca_var_r, whiten=True) pca.fit(x) - logging.info('PCA dim=%d' % pca.pca_dim) + logging.info("PCA dim=%d" % pca.pca_dim) x = pca.predict(x) x = LNorm().predict(x) plda_model = plda_model.project(pca.T, pca.mu) - + scores = plda_model.llr_1vs1(x, x) if do_unsup_cal: scores_cal, bic, gmm_2c = unsup_gmm_calibration(scores) - logging.info('UnsupCal. BIC={} gmm.pi={} gmm.mu={} gmm.sigma={}'.format( - bic, gmm_2c.pi, gmm_2c.mu, np.sqrt(1./gmm_2c.Lambda))) + logging.info( + "UnsupCal. BIC={} gmm.pi={} gmm.mu={} gmm.sigma={}".format( + bic, gmm_2c.pi, gmm_2c.mu, np.sqrt(1.0 / gmm_2c.Lambda) + ) + ) if hist_file: - hist_file_1 = '%s-nocal.pdf' % hist_file + hist_file_1 = "%s-nocal.pdf" % hist_file plot_score_hist(scores, hist_file_1, None, gmm_2c) scores = scores_cal if hist_file: - hist_file_1 = '%s.pdf' % hist_file + hist_file_1 = "%s.pdf" % hist_file plot_score_hist(scores, hist_file_1, threshold) if use_bic and bic < 0: # unsup calibration detected only one Gaussian -> only target trials class_ids = np.zeros(len(x), dtype=np.int) return class_ids - + ahc = AHC() ahc.fit(scores) class_ids = ahc.get_flat_clusters(threshold) @@ -221,20 +258,31 @@ def do_clustering(x, t_preproc, plda_model, threshold, pca_var_r, return class_ids -def eval_ahc(test_list, v_file, timestamps_file, vad_file, - preproc_file, rttm_file, - win_start=None, win_length=None, win_shift=None, win_shrink=0, - score_hist_dir=None, - part_idx=1, num_parts=1, **kwargs): - - logging.info('reading utterance list %s' % test_list) +def eval_ahc( + test_list, + v_file, + timestamps_file, + vad_file, + preproc_file, + rttm_file, + win_start=None, + win_length=None, + win_shift=None, + win_shrink=0, + score_hist_dir=None, + part_idx=1, + num_parts=1, + **kwargs +): + + logging.info("reading utterance list %s" % test_list) keys = load_test_list(test_list, part_idx, num_parts) - logging.info('init data readers') + logging.info("init data readers") r_x, r_time, r_vad = init_readers(v_file, timestamps_file, vad_file, **kwargs) - logging.info('loading embedding preprocessor: %s' % (preproc_file)) + logging.info("loading embedding preprocessor: %s" % (preproc_file)) t_preproc = TransformList.load(preproc_file) plda_args = F.filter_eval_args(**kwargs) - logging.info('loading plda model={}'.format(plda_args)) + logging.info("loading plda model={}".format(plda_args)) plda_model = F.load_plda(**plda_args) diar_args = Diar.filter_args(**kwargs) diarizer = Diar(plda_model, t_preproc, **diar_args) @@ 
-246,14 +294,14 @@ def eval_ahc(test_list, v_file, timestamps_file, vad_file, hist_file = None rttms = [] - + for key in keys: - logging.info('loading data for utt %s' % (key)) + logging.info("loading data for utt %s" % (key)) x, timestamps, ts2segs = load_feats( - key, r_x, r_time, r_vad, - win_start, win_length, win_shift, win_shrink) + key, r_x, r_time, r_vad, win_start, win_length, win_shift, win_shrink + ) - logging.info('clustering utt {} x={}'.format(key, x.shape)) + logging.info("clustering utt {} x={}".format(key, x.shape)) if score_hist_dir is not None: hist_file = score_hist_dir / key @@ -261,10 +309,11 @@ def eval_ahc(test_list, v_file, timestamps_file, vad_file, # seg_class_ids = do_clustering( # x, t_preproc, plda_model, threshold, pca_var_r, do_unsup_cal, use_bic, hist_file) ts_class_ids = seg_class_ids[ts2segs] - logging.info('utt %s found %d spks' % (key, np.max(seg_class_ids)+1)) + logging.info("utt %s found %d spks" % (key, np.max(seg_class_ids) + 1)) rttm = RTTM.create_spkdiar_single_file( - key, timestamps[:,0], timestamps[:,1]-timestamps[:,0], ts_class_ids) + key, timestamps[:, 0], timestamps[:, 1] - timestamps[:, 0], ts_class_ids + ) rttm.merge_adjacent_segments() rttms.append(rttm) @@ -274,39 +323,45 @@ def eval_ahc(test_list, v_file, timestamps_file, vad_file, if __name__ == "__main__": - parser=ArgumentParser( - description='Evals AHC with PLDA scoring') + parser = ArgumentParser(description="Evals AHC with PLDA scoring") + + parser.add_argument("--test-list", required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--timestamps-file", default=None) + parser.add_argument("--vad-file", default=None) + parser.add_argument("--preproc-file", default=None) + VRF.add_argparse_args(parser, prefix="vad") - parser.add_argument('--test-list', required=True) - parser.add_argument('--v-file', required=True) - parser.add_argument('--timestamps-file', default=None) - parser.add_argument('--vad-file', default=None) - parser.add_argument('--preproc-file', default=None) - VRF.add_argparse_args(parser, prefix='vad') - F.add_argparse_eval_args(parser) Diar.add_argparse_args(parser) - parser.add_argument('--win-start', default=-0.675, type=float) - parser.add_argument('--win-length', default=1.5, type=float) - parser.add_argument('--win-shift', default=0.25, type=float) - parser.add_argument('--win-shrink', default=0.675, type=float) - #parser.add_argument('--threshold', default=0, type=float) - #parser.add_argument('--pca-var-r', default=1, type=float) - #parser.add_argument('--do-unsup-cal', default=False, action='store_true') - #parser.add_argument('--use-bic', default=False, action='store_true') - - parser.add_argument('--part-idx', type=int, default=1, - help=('splits the list of files in num-parts ' - 'and process part_idx')) - parser.add_argument('--num-parts', type=int, default=1, - help=('splits the list of files in num-parts ' - 'and process part_idx')) - - parser.add_argument('--rttm-file', required=True) - parser.add_argument('--score-hist-dir', default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--win-start", default=-0.675, type=float) + parser.add_argument("--win-length", default=1.5, type=float) + parser.add_argument("--win-shift", default=0.25, type=float) + parser.add_argument("--win-shrink", default=0.675, type=float) + # parser.add_argument('--threshold', default=0, type=float) + # parser.add_argument('--pca-var-r', default=1, type=float) + # 
parser.add_argument('--do-unsup-cal', default=False, action='store_true') + # parser.add_argument('--use-bic', default=False, action='store_true') + + parser.add_argument( + "--part-idx", + type=int, + default=1, + help=("splits the list of files in num-parts " "and process part_idx"), + ) + parser.add_argument( + "--num-parts", + type=int, + default=1, + help=("splits the list of files in num-parts " "and process part_idx"), + ) + + parser.add_argument("--rttm-file", required=True) + parser.add_argument("--score-hist-dir", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -314,4 +369,3 @@ def eval_ahc(test_list, v_file, timestamps_file, vad_file, logging.debug(args) eval_ahc(**vars(args)) - diff --git a/egs/dihard2019/v1/steps_diar/train-plda-v1.py b/egs/dihard2019/v1/steps_diar/train-plda-v1.py index ed033bac..c7589c8a 100755 --- a/egs/dihard2019/v1/steps_diar/train-plda-v1.py +++ b/egs/dihard2019/v1/steps_diar/train-plda-v1.py @@ -7,7 +7,12 @@ import logging import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import numpy as np @@ -15,45 +20,48 @@ from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info -#from hyperion.helpers import VectorClassReader as VCR + +# from hyperion.helpers import VectorClassReader as VCR from hyperion.transforms import TransformList, LDA, LNorm, PCA from hyperion.helpers import PLDAFactory as F from hyperion.io import RandomAccessDataReaderFactory as DRF from numpy.linalg import matrix_rank + def load_train_list(train_list, inter_session): - u2c = Utt2Info.load(train_list, sep=' ') + u2c = Utt2Info.load(train_list, sep=" ") if inter_session: class_ids = u2c.info else: - class_ids = ['%s-%s' % (k,s) for k,s in zip(u2c.key, u2c.info)] - + class_ids = ["%s-%s" % (k, s) for k, s in zip(u2c.key, u2c.info)] + _, class_ids = np.unique(u2c.info, return_inverse=True) return u2c.key, class_ids -#from memory_profiler import profile -#@profile + +# from memory_profiler import profile +# @profile from pympler.classtracker import ClassTracker from pympler import asizeof + def load_vectors(v_file, keys, class_ids, subsampling): - + x = [] out_class_ids = [] num_files = 0 num_read = 0 with DRF.create(v_file) as reader: - #tracker = ClassTracker() - #tracker.track_object(reader) + # tracker = ClassTracker() + # tracker.track_object(reader) for key, class_id in zip(keys, class_ids): x_i = reader.read(key)[0] if subsampling > 1: - x_i = x_i[::subsampling,:].copy() + x_i = x_i[::subsampling, :].copy() - if len(x_i)==0: - logging.info('read empty matrix from key={}'.format( - key, x_i.shape)) + if len(x_i) == 0: + logging.info("read empty matrix from key={}".format(key, x_i.shape)) continue x.append(x_i) @@ -61,48 +69,60 @@ def load_vectors(v_file, keys, class_ids, subsampling): out_class_ids += [class_id] * num_read_i num_files += 1 num_read += num_read_i - logging.info('read vectors from key={} with shape={}'.format( - key, x_i.shape)) + logging.info( + "read vectors from key={} with shape={}".format(key, x_i.shape) + ) # logging.info('read vectors from key={} with shape={} {} {}'.format( # key, x_i.shape, np.sum(np.isnan(x_i)), matrix_rank(x_i))) - logging.info('total read files={} vectors={}'.format( - num_files, num_read)) + logging.info("total read 
files={} vectors={}".format(num_files, num_read)) assert not np.any(np.isnan(x_i)) # if num_files > 60000: # break - #tracker.create_snapshot() + # tracker.create_snapshot() # logging.info('1 {}'.format(asizeof.asized(x, detail=1).format())) # logging.info('2 {}'.format(asizeof.asized(x_i, detail=1).format())) # logging.info('3 {} {} {}'.format(x_i.shape[0]*x_i.shape[1]*x_i.itemsize, x_i.nbytes, x_i.size*x_i.itemsize)) # logging.info('4 {} {} {}'.format(xb.shape[0]*xb.shape[1]*xb.itemsize, xb.nbytes, xb.size*xb.itemsize)) - #tracker.stats.print_summary() + # tracker.stats.print_summary() x = np.concatenate(tuple(x), axis=0) out_class_ids = np.asarray(out_class_ids, dtype=np.int) return x, out_class_ids -def train_plda(v_file, train_list, - lda_dim, plda_type, y_dim, z_dim, epochs, ml_md, md_epochs, - output_path, inter_session, subsampling, **kwargs): +def train_plda( + v_file, + train_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + output_path, + inter_session, + subsampling, + **kwargs +): keys, class_ids = load_train_list(train_list, inter_session) - logging.info('reading {} utts with {} classes'.format( - len(keys), np.max(class_ids)+1)) + logging.info( + "reading {} utts with {} classes".format(len(keys), np.max(class_ids) + 1) + ) x, class_ids = load_vectors(v_file, keys, class_ids, subsampling) - logging.info('read x={} with num_classes={}'.format( - x.shape, np.max(class_ids)+1)) + logging.info("read x={} with num_classes={}".format(x.shape, np.max(class_ids) + 1)) t1 = time.time() - #logging.info('%d %d' % (np.sum(np.isnan(x)), np.sum(np.isinf(x)))) + # logging.info('%d %d' % (np.sum(np.isnan(x)), np.sum(np.isinf(x)))) rank = PCA.get_pca_dim_for_var_ratio(x, 1) - logging.info('x-rank=%d' % (rank)) + logging.info("x-rank=%d" % (rank)) pca = None if rank < x.shape[1]: # do PCA if rank of x is smaller than its dimension - logging.info('PCA rank=%d' % (rank)) - pca = PCA(pca_dim=rank, name='pca') + logging.info("PCA rank=%d" % (rank)) + pca = PCA(pca_dim=rank, name="pca") pca.fit(x) x = pca.predict(x) if lda_dim > rank: @@ -111,28 +131,28 @@ def train_plda(v_file, train_list, y_dim = rank # Train LDA - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x = lda.predict(x) - logging.info('PCA-LDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("PCA-LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x) x = lnorm.predict(x) - logging.info('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo, elbo_norm = plda.fit(x, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo, elbo_norm = plda.fit( + x, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs + ) - logging.info('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("PLDA Elapsed time: %.2f s." 
% (time.time() - t1)) # Save models if pca is None: @@ -143,46 +163,55 @@ def train_plda(v_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") print(epochs, elbo.shape) - pd.DataFrame({ 'epochs': np.arange(1, epochs+1, dtype=np.int), - 'elbo': elbo, - 'elbo_per_sample': elbo_norm}).to_csv( - output_path + '/elbo.csv', index=False) - - + pd.DataFrame( + { + "epochs": np.arange(1, epochs + 1, dtype=np.int), + "elbo": elbo, + "elbo_per_sample": elbo_norm, + } + ).to_csv(output_path + "/elbo.csv", index=False) + + if __name__ == "__main__": - parser=ArgumentParser( - description='Train LDA/PLDA back-end for diarization') + parser = ArgumentParser(description="Train LDA/PLDA back-end for diarization") + + parser.add_argument("--v-file", required=True, help="embedding read specifier") + parser.add_argument( + "--train-list", required=True, help="train list utterance spkid" + ) - parser.add_argument('--v-file', required=True, - help='embedding read specifier') - parser.add_argument('--train-list', required=True, - help='train list utterance spkid') - F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', required=True) - parser.add_argument('--lda-dim', type=int, - default=None) - parser.add_argument('--inter-session', default=False, action='store_true', - help=('if True, model inter-session variability, ' - 'if False, model intra-session variability')) - parser.add_argument('--subsampling', default=1, type=int, - help=('subsamples the embeddings to reduce memory and ' - 'computing cost')) - - parser.add_argument('-v', '--verbose', dest='verbose', - default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", required=True) + parser.add_argument("--lda-dim", type=int, default=None) + parser.add_argument( + "--inter-session", + default=False, + action="store_true", + help=( + "if True, model inter-session variability, " + "if False, model intra-session variability" + ), + ) + parser.add_argument( + "--subsampling", + default=1, + type=int, + help=("subsamples the embeddings to reduce memory and " "computing cost"), + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_plda(**vars(args)) - + train_plda(**vars(args)) diff --git a/egs/mnist/v1/local/main.py b/egs/mnist/v1/local/main.py index 4b7b3226..04129e3a 100755 --- a/egs/mnist/v1/local/main.py +++ b/egs/mnist/v1/local/main.py @@ -20,107 +20,160 @@ from hyperion.hyp_defs import config_logger from hyperion.torch.utils import open_device -from hyperion.torch.narchs import FCNetV1, TDNNV1, ETDNNV1, ResETDNNV1, LResNet18, LResNet50, LResNext50_4x4d +from hyperion.torch.narchs import ( + FCNetV1, + TDNNV1, + ETDNNV1, + ResETDNNV1, + LResNet18, + LResNet50, + LResNext50_4x4d, +) from hyperion.torch.transforms import Reshape from hyperion.torch.helpers import OptimizerFactory as OF from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF from hyperion.torch.trainers import TorchTrainer from hyperion.torch.metrics import CategoricalAccuracy -input_width=28 -input_height=28 +input_width = 28 +input_height = 28 def create_net(net_type): - if net_type == 'fcnet': - return FCNetV1(2, input_width*input_height, 1000, 10, 
dropout_rate=0.5) - if net_type == 'tdnn': - return TDNNV1(2, input_height, 1000, 10, dropout_rate=0.5, pooling='mean') - if net_type == 'etdnn': - return ETDNNV1(2, input_height, 1000, 10, dropout_rate=0.5, pooling='mean') - if net_type == 'resetdnn': - return ResETDNNV1(3, input_height, 1000, 1000, 10, dropout_rate=0.5, pooling='mean') - if net_type == 'lresnet18': + if net_type == "fcnet": + return FCNetV1(2, input_width * input_height, 1000, 10, dropout_rate=0.5) + if net_type == "tdnn": + return TDNNV1(2, input_height, 1000, 10, dropout_rate=0.5, pooling="mean") + if net_type == "etdnn": + return ETDNNV1(2, input_height, 1000, 10, dropout_rate=0.5, pooling="mean") + if net_type == "resetdnn": + return ResETDNNV1( + 3, input_height, 1000, 1000, 10, dropout_rate=0.5, pooling="mean" + ) + if net_type == "lresnet18": return LResNet18(1, out_units=10, dropout_rate=0.5) - if net_type == 'lresnet50': + if net_type == "lresnet50": return LResNet50(1, out_units=10, dropout_rate=0.5) - if net_type == 'lresnext50': + if net_type == "lresnext50": return LResNext50_4x4d(1, out_units=10, dropout_rate=0.5) - -def main(net_type, batch_size, test_batch_size, exp_path, - epochs, num_gpus, log_interval, resume, **kwargs): - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) +def main( + net_type, + batch_size, + test_batch_size, + exp_path, + epochs, + num_gpus, + log_interval, + resume, + **kwargs +): + + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) device = open_device(num_gpus=num_gpus) - transform_list = [transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))] - if net_type == 'fcnet': + transform_list = [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] + if net_type == "fcnet": transform_list.append(Reshape((-1,))) - elif net_type == 'tdnn' or net_type == 'etdnn' or net_type == 'resetdnn': - transform_list.append(Reshape((input_height,input_width))) + elif net_type == "tdnn" or net_type == "etdnn" or net_type == "resetdnn": + transform_list.append(Reshape((input_height, input_width))) transform = transforms.Compose(transform_list) - - largs = {'num_workers': 1, 'pin_memory': True} if num_gpus>0 else {} + + largs = {"num_workers": 1, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - datasets.MNIST('./exp/data', train=True, download=True, - transform=transform), - batch_size=args.batch_size, shuffle=True, **largs) + datasets.MNIST("./exp/data", train=True, download=True, transform=transform), + batch_size=args.batch_size, + shuffle=True, + **largs + ) test_loader = torch.utils.data.DataLoader( - datasets.MNIST('./exp/data', train=False, transform=transform), - batch_size=args.test_batch_size, shuffle=False, **largs) + datasets.MNIST("./exp/data", train=False, transform=transform), + batch_size=args.test_batch_size, + shuffle=False, + **largs + ) model = create_net(net_type) - #model.to(device) + # model.to(device) print(opt_args) print(lrsch_args) optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) loss = nn.CrossEntropyLoss() - metrics = { 'acc': CategoricalAccuracy() } - - trainer = TorchTrainer(model, optimizer, loss, epochs, exp_path, device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1)) + metrics = {"acc": CategoricalAccuracy()} + + trainer = TorchTrainer( + model, + optimizer, + loss, + epochs, + exp_path, + device=device, + 
metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='PyTorch MNIST') - parser.add_argument('--net-type', default='fcnet', metavar='N', - help='Type of network architecture') - - parser.add_argument('--batch-size', type=int, default=64, - help='input batch size for training (default: 64)') - parser.add_argument('--test-batch-size', type=int, default=1000, - help='input batch size for testing (default: 1000)') - parser.add_argument('--epochs', type=int, default=10, - help='number of epochs to train (default: 10)') - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1, - help='random seed (default: 1)') - parser.add_argument('--log-interval', type=int, default=10, - help='how many batches to wait before logging training status') - - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - - parser.add_argument('--exp-path', help='experiment path') - - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + fromfile_prefix_chars="@", + description="PyTorch MNIST", + ) + parser.add_argument( + "--net-type", default="fcnet", metavar="N", help="Type of network architecture" + ) + + parser.add_argument( + "--batch-size", + type=int, + default=64, + help="input batch size for training (default: 64)", + ) + parser.add_argument( + "--test-batch-size", + type=int, + default=1000, + help="input batch size for testing (default: 1000)", + ) + parser.add_argument( + "--epochs", type=int, default=10, help="number of epochs to train (default: 10)" + ) + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1, help="random seed (default: 1)") + parser.add_argument( + "--log-interval", + type=int, + default=10, + help="how many batches to wait before logging training status", + ) + + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + + parser.add_argument("--exp-path", help="experiment path") + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -131,6 +184,3 @@ def main(net_type, batch_size, test_batch_size, exp_path, del args.seed main(**vars(args)) - - - diff --git a/egs/mnist/v1/local/resnet-mnist.py b/egs/mnist/v1/local/resnet-mnist.py index 80262998..b176ac73 100755 --- a/egs/mnist/v1/local/resnet-mnist.py +++ b/egs/mnist/v1/local/resnet-mnist.py @@ -28,87 +28,121 @@ from hyperion.torch.trainers import TorchTrainer from hyperion.torch.metrics import CategoricalAccuracy -input_width=28 -input_height=28 +input_width = 28 +input_height = 28 - -def main(batch_size, test_batch_size, exp_path, - epochs, num_gpus, log_interval, resume, **kwargs): - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) +def main( + 
batch_size, + test_batch_size, + exp_path, + epochs, + num_gpus, + log_interval, + resume, + **kwargs +): + + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) device = open_device(num_gpus=num_gpus) - transform_list = [transforms.ToTensor(), - transforms.Normalize((0.1307,), (0.3081,))] + transform_list = [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))] transform = transforms.Compose(transform_list) - - largs = {'num_workers': 2, 'pin_memory': True} if num_gpus>0 else {} + + largs = {"num_workers": 2, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - datasets.MNIST('./exp/data', train=True, download=True, - transform=transform), - batch_size=args.batch_size, shuffle=True, **largs) + datasets.MNIST("./exp/data", train=True, download=True, transform=transform), + batch_size=args.batch_size, + shuffle=True, + **largs + ) test_loader = torch.utils.data.DataLoader( - datasets.MNIST('./exp/data', train=False, transform=transform), - batch_size=args.test_batch_size, shuffle=False, **largs) + datasets.MNIST("./exp/data", train=False, transform=transform), + batch_size=args.test_batch_size, + shuffle=False, + **largs + ) model_args = RNF.filter_args(**kwargs) - model_args['in_channels'] = 1 - model_args['out_units'] = 10 - logging.info('model-args={}'.format(model_args)) + model_args["in_channels"] = 1 + model_args["out_units"] = 10 + logging.info("model-args={}".format(model_args)) model = RNF.create(**model_args) - logging.info('model={}'.format(model)) + logging.info("model={}".format(model)) - opt_args = OF.filter_args(prefix='opt', **kwargs) - logging.info('optim-args={}'.format(opt_args)) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) - logging.info('lr-sched-args={}'.format(lrsch_args)) + opt_args = OF.filter_args(prefix="opt", **kwargs) + logging.info("optim-args={}".format(opt_args)) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) + logging.info("lr-sched-args={}".format(lrsch_args)) optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) loss = nn.CrossEntropyLoss() - metrics = { 'acc': CategoricalAccuracy() } - - trainer = TorchTrainer(model, optimizer, loss, epochs, exp_path, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1)) + metrics = {"acc": CategoricalAccuracy()} + + trainer = TorchTrainer( + model, + optimizer, + loss, + epochs, + exp_path, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='PyTorch MNIST') - - parser.add_argument('--batch-size', type=int, default=128, - help='input batch size for training') - parser.add_argument('--test-batch-size', type=int, default=100, - help='input batch size for testing') - parser.add_argument('--epochs', type=int, default=10, - help='number of epochs to train') + fromfile_prefix_chars="@", + description="PyTorch MNIST", + ) + + parser.add_argument( + "--batch-size", type=int, default=128, help="input batch size for training" + ) + parser.add_argument( + "--test-batch-size", type=int, default=100, help="input batch size for testing" + ) + parser.add_argument( + "--epochs", type=int, default=10, help="number of epochs to 
train" + ) RNF.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') - - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1, - help='random seed') - parser.add_argument('--log-interval', type=int, default=10, - help='how many batches to wait before logging training status') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('--exp-path', help='experiment path') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") + + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1, help="random seed") + parser.add_argument( + "--log-interval", + type=int, + default=10, + help="how many batches to wait before logging training status", + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument("--exp-path", help="experiment path") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -119,6 +153,3 @@ def main(batch_size, test_batch_size, exp_path, del args.seed main(**vars(args)) - - - diff --git a/egs/sre18/v1.8k/local/make_musan.py b/egs/sre18/v1.8k/local/make_musan.py index 8a93a18b..b0ae6846 100755 --- a/egs/sre18/v1.8k/local/make_musan.py +++ b/egs/sre18/v1.8k/local/make_musan.py @@ -7,125 +7,183 @@ import os, sys + def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals def prepare_music(root_dir, fs, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + 
utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_speech(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_noise(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: 
- utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def main(): - in_dir = sys.argv[1] - fs = int(sys.argv[2]) - out_dir = sys.argv[3] - use_vocals = sys.argv[4] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) -if __name__=="__main__": - main() +if __name__ == "__main__": + main() diff --git a/egs/sre18/v1.8k/local/rttm2vad.py b/egs/sre18/v1.8k/local/rttm2vad.py index 4370077d..a1960411 100644 --- a/egs/sre18/v1.8k/local/rttm2vad.py +++ b/egs/sre18/v1.8k/local/rttm2vad.py @@ -7,31 +7,33 @@ import numpy as np import pandas as pd -frame_shift=0.01 +frame_shift = 0.01 + def write_vad(f, file_id, vad): - f.write('%s [ ' % (file_id)) + f.write("%s [ " % (file_id)) for i in range(len(vad)): - f.write('%d ' % vad[i]) - f.write(']\n') + f.write("%d " % vad[i]) + f.write("]\n") + - def rttm2vad_file(file_id, rttm, num_frames, fvad, fu2o, min_dur): _, spk_ids = np.unique(rttm.name, return_inverse=True) - num_spks = np.max(spk_ids)+1 + num_spks = np.max(spk_ids) + 1 if len(spk_ids) == 1: vad = np.zeros((num_frames,), dtype=int) - tbeg = np.round(rttm.tbeg/frame_shift).astype('int') - tend = 
min(np.round((rttm.tbeg+rttm.tdur)/frame_shift).astype('int'), num_frames) - vad[tbeg:tend+1] = 1 - file_dir_id = '%s-d%03d' % (file_id,0) + tbeg = np.round(rttm.tbeg / frame_shift).astype("int") + tend = min( + np.round((rttm.tbeg + rttm.tdur) / frame_shift).astype("int"), num_frames + ) + vad[tbeg : tend + 1] = 1 + file_dir_id = "%s-d%03d" % (file_id, 0) write_vad(fvad, file_dir_id, vad) - fu2o.write('%s %s\n' % (file_dir_id, file_id)) + fu2o.write("%s %s\n" % (file_dir_id, file_id)) return - - + total_dur = np.zeros((num_spks,), dtype=float) for i in range(num_spks): idx = spk_ids == i @@ -42,53 +44,69 @@ def rttm2vad_file(file_id, rttm, num_frames, fvad, fu2o, min_dur): if total_dur[i] >= min_dur or do_all: vad = np.zeros((num_frames,), dtype=int) idx = spk_ids == i - tbeg = np.round(np.array(rttm.tbeg.loc[idx])/frame_shift).astype('int') - tend = np.round(np.array(rttm.tbeg.loc[idx]+rttm.tdur.loc[idx])/frame_shift).astype('int') + tbeg = np.round(np.array(rttm.tbeg.loc[idx]) / frame_shift).astype("int") + tend = np.round( + np.array(rttm.tbeg.loc[idx] + rttm.tdur.loc[idx]) / frame_shift + ).astype("int") for j in range(len(tbeg)): - vad[tbeg[j]:tend[j]+1] = 1 - file_dir_id = '%s-d%03d' % (file_id,i) + vad[tbeg[j] : tend[j] + 1] = 1 + file_dir_id = "%s-d%03d" % (file_id, i) write_vad(fvad, file_dir_id, vad) - fu2o.write('%s %s\n' % (file_dir_id, file_id)) - + fu2o.write("%s %s\n" % (file_dir_id, file_id)) + def rttm2vad(rttm_file, num_frames_file, vad_file, utt2orig, min_dur): - rttm = pd.read_csv(rttm_file, sep='\s+', header=None, - names=['segment_type','file_id','chnl','tbeg','tdur', - 'ortho','stype','name','conf','slat']) + rttm = pd.read_csv( + rttm_file, + sep="\s+", + header=None, + names=[ + "segment_type", + "file_id", + "chnl", + "tbeg", + "tdur", + "ortho", + "stype", + "name", + "conf", + "slat", + ], + ) rttm.index = rttm.file_id - - df_num_frames = pd.read_csv(num_frames_file, sep='\s+', header=None, - names=['file_id','num_frames']) + + df_num_frames = pd.read_csv( + num_frames_file, sep="\s+", header=None, names=["file_id", "num_frames"] + ) df_num_frames.index = df_num_frames.file_id + with open(vad_file, "w") as fvad: + with open(utt2orig, "w") as fu2o: - with open(vad_file, 'w') as fvad: - with open(utt2orig, 'w') as fu2o: - for file_id in df_num_frames.file_id: num_frames_i = int(df_num_frames.num_frames.loc[file_id]) print(file_id) rttm_i = rttm.loc[file_id] - file_diars_ids=rttm2vad_file( - file_id, rttm_i, num_frames_i, fvad, fu2o, min_dur) - - + file_diars_ids = rttm2vad_file( + file_id, rttm_i, num_frames_i, fvad, fu2o, min_dur + ) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Converts RTTM to kaldi VAD files') - - parser.add_argument('--rttm',dest='rttm_file', required=True) - parser.add_argument('--num-frames', dest='num_frames_file', required=True) - parser.add_argument('--vad-file', dest='vad_file', required=True) - parser.add_argument('--utt2orig', dest='utt2orig', required=True) - parser.add_argument('--min-dur', dest='min_dur', type=float, default=10) - args=parser.parse_args() - + fromfile_prefix_chars="@", + description="Converts RTTM to kaldi VAD files", + ) + + parser.add_argument("--rttm", dest="rttm_file", required=True) + parser.add_argument("--num-frames", dest="num_frames_file", required=True) + parser.add_argument("--vad-file", dest="vad_file", required=True) + 
parser.add_argument("--utt2orig", dest="utt2orig", required=True) + parser.add_argument("--min-dur", dest="min_dur", type=float, default=10) + args = parser.parse_args() + rttm2vad(**vars(args)) - diff --git a/egs/sre18/v1.8k/local/score_dcf.py b/egs/sre18/v1.8k/local/score_dcf.py index 4026d7c9..1137e049 100755 --- a/egs/sre18/v1.8k/local/score_dcf.py +++ b/egs/sre18/v1.8k/local/score_dcf.py @@ -20,48 +20,54 @@ def score_dcf(key_file, score_file, output_path): - logging.info('Load key: %s' % key_file) + logging.info("Load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('Load scores: %s' % score_file) + logging.info("Load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) - priors = np.array([0.001, 0.005, 0.01, 0.05 ]) + priors = np.array([0.001, 0.005, 0.01, 0.05]) min_dcf, act_dcf, eer, _ = fast_eval(tar, non, priors) - + output_dir = os.path.dirname(output_path) if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_file = output_path + '_results' - with open(output_file, 'w') as f: - s = 'EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}'.format( - eer * 100, min_dcf[3], act_dcf[3], - min_dcf[2], act_dcf[2], - min_dcf[1], act_dcf[1], - min_dcf[0], act_dcf[0]) + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + ) f.write(s) logging.info(s) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Computes EER and DCF') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - score_dcf(**vars(args)) - + score_dcf(**vars(args)) diff --git a/egs/sre18/v1.8k/local/sre18_diar_to_vad.py b/egs/sre18/v1.8k/local/sre18_diar_to_vad.py index b4af18db..2347c7d3 100755 --- a/egs/sre18/v1.8k/local/sre18_diar_to_vad.py +++ b/egs/sre18/v1.8k/local/sre18_diar_to_vad.py @@ -14,32 +14,40 @@ if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Diarization file to binary vad') - - parser.add_argument(dest='diar_file') - parser.add_argument(dest='num_frames_file') - - args=parser.parse_args() - - utt2num_frames = pd.read_csv(args.num_frames_file, sep=' ', header=None, 
names=['utt','num_frames', 'None'], index_col=0) - diar = pd.read_csv(args.diar_file, sep=' ', header=None, names=['utt', 'start','end'], index_col=0) - + fromfile_prefix_chars="@", + description="Diarization file to binary vad", + ) + + parser.add_argument(dest="diar_file") + parser.add_argument(dest="num_frames_file") + + args = parser.parse_args() + + utt2num_frames = pd.read_csv( + args.num_frames_file, + sep=" ", + header=None, + names=["utt", "num_frames", "None"], + index_col=0, + ) + diar = pd.read_csv( + args.diar_file, sep=" ", header=None, names=["utt", "start", "end"], index_col=0 + ) for key in utt2num_frames.index.values: - num_frames_i = utt2num_frames['num_frames'][key] + num_frames_i = utt2num_frames["num_frames"][key] vad = np.zeros((num_frames_i,), dtype=int) - start_i = np.array(diar.loc[key]['start'], dtype=int) - end_i = np.array(diar.loc[key]['end'], dtype=int) - if start_i.ndim==0: + start_i = np.array(diar.loc[key]["start"], dtype=int) + end_i = np.array(diar.loc[key]["end"], dtype=int) + if start_i.ndim == 0: start_i = [start_i] end_i = [end_i] - for s,e in zip(start_i,end_i): - if e > num_frames_i-1: - e = num_frames_i-1 - vad[s:e+1] = 1 - - svad = key + ' [ ' + ' '.join([str(v) for v in vad]) + ' ]' + for s, e in zip(start_i, end_i): + if e > num_frames_i - 1: + e = num_frames_i - 1 + vad[s : e + 1] = 1 + + svad = key + " [ " + " ".join([str(v) for v in vad]) + " ]" print(svad) diff --git a/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py b/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py index 9b8d98b7..fa16dfce 100755 --- a/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-calibration-v1.py @@ -25,45 +25,45 @@ def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - - logging.info('load scores: %s' % in_score_file) + + logging.info("load scores: %s" % in_score_file) scr = TrialScores.load_txt(in_score_file) scr = scr.align_with_ndx(ndx) - logging.info('load model: %s' % model_file) + logging.info("load model: %s" % model_file) lr = LR.load(model_file) - logging.info('apply calibration') + logging.info("apply calibration") s_cal = lr.predict(scr.scores.ravel()) scr.scores = np.reshape(s_cal, scr.scores.shape) - logging.info('save scores: %s' % out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals linear calibration') + fromfile_prefix_chars="@", + description="Evals linear calibration", + ) - parser.add_argument('--in-score-file', dest='in_score_file', required=True) - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + 
"-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_calibration(**vars(args)) - + eval_calibration(**vars(args)) diff --git a/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py b/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py index 74dbe2f0..d3b35fba 100755 --- a/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-tel-be-snorm-v1.py @@ -25,14 +25,22 @@ from hyperion.helpers import VectorReader as VR -def eval_plda(iv_file, ndx_file, enroll_file, - preproc_file, - coh_iv_file, coh_list, coh_nbest, - model_file, score_file, plda_type, - pool_method, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + model_file, + score_file, + plda_type, + pool_method, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -42,79 +50,86 @@ def eval_plda(iv_file, ndx_file, enroll_file, x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_Nvs1(x_e, x_t, method=pool_method, ids1=ids_e) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('loading cohort data') + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_Nvs1(x_e, x_coh, method=pool_method, ids1=ids_e) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 telephone condition with S-Norm') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 telephone condition with S-Norm", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg'], - help=('(default: %(default)s)')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + help=("(default: %(default)s)"), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py b/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py index 1c8bc03e..d9668e1a 100755 --- a/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-tel-be-v1.py @@ -22,14 +22,20 @@ from hyperion.transforms import TransformList - -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, - pool_method, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + pool_method, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -39,54 +45,59 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) - logging.info('loading plda model: %s' % (model_file)) + 
logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_Nvs1(x_e, x_t, method=pool_method, ids1=ids_e) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 telephone condition') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 telephone condition", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg'], - help=('(default: %(default)s)')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + help=("(default: %(default)s)"), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py index 4676fb71..c37d450a 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-snorm-v1.py @@ -27,24 +27,36 @@ def combine_diar_scores(ndx, diar_ndx, diar2orig, diar_scores): - d2o = SCPList.load(diar2orig, sep=' ') + d2o = SCPList.load(diar2orig, sep=" ") d2o = d2o.filter(diar_ndx.seg_set) scores = np.zeros(ndx.trial_mask.shape, dtype=float_cpu()) for j in range(len(ndx.seg_set)): idx = d2o.file_path == ndx.seg_set[j] diar_scores_j = diar_scores[:, idx] scores_j = np.max(diar_scores_j, axis=1) - scores[:,j] = scores_j + scores[:, j] = scores_j 
return scores -def eval_plda(iv_file, ndx_file, diar_ndx_file, enroll_file, diar2orig, - preproc_file, - coh_iv_file, coh_list, coh_nbest, coh_nbest_discard, - model_file, score_file, plda_type, **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + diar_ndx_file, + enroll_file, + diar2orig, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + coh_nbest_discard, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -53,82 +65,88 @@ def eval_plda(iv_file, ndx_file, diar_ndx_file, enroll_file, diar2orig, tdr = TDR(iv_file, diar_ndx_file, enroll_file, None, preproc) x_e, x_t, enroll, diar_ndx = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('loading cohort data') + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_1vs1(x_e, x_coh) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest, nbest_discard=coh_nbest_discard) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('combine cluster scores') + logging.info("combine cluster scores") ndx = TrialNdx.load(ndx_file) scores = combine_diar_scores(ndx, diar_ndx, diar2orig, scores) - - logging.info('saving scores to %s' % (score_file)) + + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s = s.align_with_ndx(ndx) s.save_txt(score_file) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 Video condition with S-Norm and diarization') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--diar-ndx-file', dest='diar_ndx_file', required=True) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--diar2orig', dest='diar2orig', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--coh-nbest-discard', dest='coh_nbest_discard', type=int, default=0) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 Video condition with S-Norm and diarization", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--diar-ndx-file", dest="diar_ndx_file", required=True) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--diar2orig", dest="diar2orig", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument( + "--coh-nbest-discard", dest="coh_nbest_discard", type=int, default=0 + ) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py index 38bb8531..c19dc074 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-diar-v1.py @@ -26,24 +26,32 @@ def combine_diar_scores(ndx, diar_ndx, diar2orig, diar_scores): - d2o = SCPList.load(diar2orig, sep=' ') + d2o = SCPList.load(diar2orig, sep=" ") d2o = d2o.filter(diar_ndx.seg_set) scores = np.zeros(ndx.trial_mask.shape, dtype=float_cpu()) for j in range(len(ndx.seg_set)): idx = d2o.file_path == ndx.seg_set[j] diar_scores_j = diar_scores[:, idx] scores_j = 
np.max(diar_scores_j, axis=1) - scores[:,j] = scores_j + scores[:, j] = scores_j return scores -def eval_plda(iv_file, ndx_file, diar_ndx_file, enroll_file, diar2orig, - preproc_file, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + diar_ndx_file, + enroll_file, + diar2orig, + preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -52,55 +60,57 @@ def eval_plda(iv_file, ndx_file, diar_ndx_file, enroll_file, diar2orig, tdr = TDR(iv_file, diar_ndx_file, enroll_file, None, preproc) x_e, x_t, enroll, diar_ndx = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - - logging.info('computing llr') + + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('combine cluster scores') + logging.info("combine cluster scores") ndx = TrialNdx.load(ndx_file) scores = combine_diar_scores(ndx, diar_ndx, diar2orig, scores) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s = s.align_with_ndx(ndx) s.save_txt(score_file) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 Video condition with diarization') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 Video condition with diarization", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--diar-ndx-file', dest='diar_ndx_file', required=True) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--diar2orig', dest='diar2orig', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--diar-ndx-file", dest="diar_ndx_file", required=True) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--diar2orig", dest="diar2orig", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git 
a/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py index 35c596eb..fc94c754 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-snorm-v1.py @@ -25,13 +25,23 @@ from hyperion.helpers import VectorReader as VR -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - coh_iv_file, coh_list, coh_nbest, coh_nbest_discard, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + coh_nbest_discard, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -39,79 +49,84 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc) x_e, x_t, enroll, ndx = tdr.read() - - logging.info('loading plda model: %s' % (model_file)) + + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) - - logging.info('loading cohort data') + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) + + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_1vs1(x_e, x_coh) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest, nbest_discard=coh_nbest_discard) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 Video condition with S-Norm') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--coh-nbest-discard', dest='coh_nbest_discard', type=int, default=0) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 Video condition with S-Norm", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument( + "--coh-nbest-discard", dest="coh_nbest_discard", type=int, default=0 + ) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py b/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py index 63e3a83b..f7d83d30 100755 --- a/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/eval-vid-be-v1.py @@ -22,12 +22,19 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -36,50 +43,51 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc) x_e, x_t, enroll, ndx = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: 
%s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 Video condition') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 Video condition", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre18/v1.8k/steps_be/train-calibration-v1.py b/egs/sre18/v1.8k/steps_be/train-calibration-v1.py index a26b310b..fa1dfcf7 100755 --- a/egs/sre18/v1.8k/steps_be/train-calibration-v1.py +++ b/egs/sre18/v1.8k/steps_be/train-calibration-v1.py @@ -23,63 +23,65 @@ def train_calibration(score_file, key_file, model_file, prior, verbose): - logging.info('load key: %s' % key_file) + logging.info("load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('load scores: %s' % score_file) + logging.info("load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) - logging.info('train calibration') + logging.info("train calibration") x = np.concatenate((tar, non)) - y = 
np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) lr = LR(prior=prior, verbose=verbose) lr.fit(x, y) print(lr.A) print(lr.b) - logging.info('save calibration at %s' % model_file) + logging.info("save calibration at %s" % model_file) lr.save(model_file) - logging.info('calibrate scores') + logging.info("calibrate scores") tar_cal = lr.predict(tar) non_cal = lr.predict(non) print(tar_cal) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - - - + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains llr calibration') - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Trains llr calibration", + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_calibration(**vars(args)) - + train_calibration(**vars(args)) diff --git a/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py b/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py index 920d2171..c9f22d83 100755 --- a/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/train-tel-be-v1.py @@ -18,14 +18,29 @@ from hyperion.utils.utt2info import Utt2Info -def train_be(iv_file, train_list, - adapt_iv_file, adapt_list, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - w_mu1, w_B1, w_W1, - w_mu2, w_B2, w_W2, num_spks, do_ahc, - output_path, **kwargs): +def train_be( + iv_file, + train_list, + adapt_iv_file, + adapt_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + w_mu1, + w_B1, + w_W1, + w_mu2, + w_B2, + w_W2, + num_spks, + do_ahc, + output_path, + **kwargs +): # Read data vcr_args = VCR.filter_args(**kwargs) @@ -35,29 +50,27 @@ def train_be(iv_file, train_list, # Train LDA t1 = time.time() - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - print('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + print("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - print('LNorm Elapsed time: %.2f s.' 
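For reference, train-calibration-v1.py fits a prior-weighted logistic regression (an affine map lr.A * s + lr.b) and reports minimum and actual DCF at the given target prior. A minimal sketch of the actual-DCF quantity it reports, assuming unit miss and false-alarm costs and the usual min(p, 1-p) normalization; the exact normalization inside hyperion's compute_act_dcf is not restated here.

import numpy as np

def actual_dcf(tar_llr, non_llr, prior):
    thr = -np.log(prior / (1 - prior))        # Bayes threshold for calibrated LLRs
    p_miss = np.mean(tar_llr < thr)           # targets rejected
    p_fa = np.mean(non_llr >= thr)            # non-targets accepted
    dcf = prior * p_miss + (1 - prior) * p_fa
    return dcf / min(prior, 1 - prior)        # normalized actual DCF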
% (time.time()-t1)) - + print("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - print('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + print("PLDA Elapsed time: %.2f s." % (time.time() - t1)) # Save models preproc = TransformList(lda) @@ -66,13 +79,13 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + # Compute mean for adapted data vcr = VCR(adapt_iv_file, adapt_list, None) x, class_ids = vcr.read() @@ -83,78 +96,69 @@ def train_be(iv_file, train_list, preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt.h5') - + preproc.save(output_path + "/lda_lnorm_adapt.h5") + x_ln = lnorm.predict(x_lda) plda_adapt1 = plda.copy() plda_adapt2 = plda.copy() - + elbo = plda.fit(x_ln, class_ids, epochs=20) plda_adapt1.weighted_avg_model(plda, w_mu1, w_B1, w_W1) - plda_adapt1.save(output_path + '/plda_adapt1.h5') + plda_adapt1.save(output_path + "/plda_adapt1.h5") num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt1.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt1.csv", elbo, delimiter=",") if not do_ahc: return - + scores = plda_adapt1.llr_1vs1(x_ln, x_ln) - ahc = AHC(method='average', metric='llr') + ahc = AHC(method="average", metric="llr") ahc.fit(scores) - class_ids2 = ahc.get_flat_clusters(num_spks, criterion='num_clusters') + class_ids2 = ahc.get_flat_clusters(num_spks, criterion="num_clusters") elbo = plda_adapt1.fit(x_ln, class_ids2, epochs=20) plda_adapt2.weighted_avg_model(plda_adapt1, w_mu2, w_B2, w_W2) - plda_adapt2.save(output_path + '/plda_adapt2.h5') - + plda_adapt2.save(output_path + "/plda_adapt2.h5") + num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt2.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt2.csv", elbo, delimiter=",") + + u2c_out = Utt2Info.create(vcr.u2c.key, class_ids2.astype("U")) + u2c_out.save(output_path + "/output_adapt_spk2utt.scp", sep=" ") + - u2c_out = Utt2Info.create(vcr.u2c.key, class_ids2.astype('U')) - u2c_out.save(output_path + '/output_adapt_spk2utt.scp', sep=' ') - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE18 telephone condition') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file', dest='adapt_iv_file', required=True) - parser.add_argument('--adapt-list', dest='adapt_list', required=True) - parser.add_argument('--do-ahc', dest='do_ahc', default=False, action='store_true') - + fromfile_prefix_chars="@", + description="Train Back-end for SRE18 telephone 
condition", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file", dest="adapt_iv_file", required=True) + parser.add_argument("--adapt-list", dest="adapt_list", required=True) + parser.add_argument("--do-ahc", dest="do_ahc", default=False, action="store_true") + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=150) - parser.add_argument('--w-mu1', dest='w_mu1', type=float, - default=1) - parser.add_argument('--w-b1', dest='w_B1', type=float, - default=1) - parser.add_argument('--w-w1', dest='w_W1', type=float, - default=1) - parser.add_argument('--w-mu2', dest='w_mu2', type=float, - default=1) - parser.add_argument('--w-b2', dest='w_B2', type=float, - default=1) - parser.add_argument('--w-w2', dest='w_W2', type=float, - default=1) - parser.add_argument('--num-spks', dest='num_spks', type=int, - default=1000) - - args=parser.parse_args() - - train_be(**vars(args)) - + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=150) + parser.add_argument("--w-mu1", dest="w_mu1", type=float, default=1) + parser.add_argument("--w-b1", dest="w_B1", type=float, default=1) + parser.add_argument("--w-w1", dest="w_W1", type=float, default=1) + parser.add_argument("--w-mu2", dest="w_mu2", type=float, default=1) + parser.add_argument("--w-b2", dest="w_B2", type=float, default=1) + parser.add_argument("--w-w2", dest="w_W2", type=float, default=1) + parser.add_argument("--num-spks", dest="num_spks", type=int, default=1000) + + args = parser.parse_args() + + train_be(**vars(args)) diff --git a/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py b/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py index 88198108..a1b0cad6 100755 --- a/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py +++ b/egs/sre18/v1.8k/steps_be/train-vid-be-v1.py @@ -4,9 +4,6 @@ """ - - - import sys import os import argparse @@ -21,46 +18,54 @@ from hyperion.utils.scp_list import SCPList -def train_be(iv_file, train_list, - adapt_iv_file_1, adapt_list_1, - adapt_iv_file_2, adapt_list_2, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, r2, - output_path, **kwargs): +def train_be( + iv_file, + train_list, + adapt_iv_file_1, + adapt_list_1, + adapt_iv_file_2, + adapt_list_2, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + r2, + output_path, + **kwargs +): # Read data vcr_args = VCR.filter_args(**kwargs) vcr_train = VCR(iv_file, train_list, None, **vcr_args) x, class_ids = vcr_train.read() - # Train LDA t1 = time.time() - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - print('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + print("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - print('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + print("LNorm Elapsed time: %.2f s." 
% (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - print('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + print("PLDA Elapsed time: %.2f s." % (time.time() - t1)) # Save models preproc = TransformList(lda) @@ -69,68 +74,65 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + # Compute mean for adapted data vr = VR(adapt_iv_file_1, adapt_list_1, None) x = vr.read() x = lda.predict(x) lnorm.update_T = False lnorm.fit(x) - + preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt.h5') - + preproc.save(output_path + "/lda_lnorm_adapt.h5") # Compute mean for adapted data 2 if adapt_list_2 is None: return - + vr = VR(adapt_iv_file_2, adapt_list_2, None) x = vr.read() x = lda.predict(x) N = x.shape[0] - alpha = N/(N+r2) - lnorm.mu = alpha*np.mean(x, axis=0) + (1-alpha)*lnorm.mu + alpha = N / (N + r2) + lnorm.mu = alpha * np.mean(x, axis=0) + (1 - alpha) * lnorm.mu print(alpha) print(lnorm.mu[:10]) preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt2.h5') + preproc.save(output_path + "/lda_lnorm_adapt2.h5") + - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE18 video condition') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file-1', dest='adapt_iv_file_1', required=True) - parser.add_argument('--adapt-list-1', dest='adapt_list_1', required=True) - parser.add_argument('--adapt-iv-file-2', dest='adapt_iv_file_2', default=None) - parser.add_argument('--adapt-list-2', dest='adapt_list_2', default=None) - parser.add_argument('--r-2', dest='r2', default=14, type=float) - + fromfile_prefix_chars="@", + description="Train Back-end for SRE18 video condition", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file-1", dest="adapt_iv_file_1", required=True) + parser.add_argument("--adapt-list-1", dest="adapt_list_1", required=True) + parser.add_argument("--adapt-iv-file-2", dest="adapt_iv_file_2", default=None) + parser.add_argument("--adapt-list-2", dest="adapt_list_2", default=None) + parser.add_argument("--r-2", dest="r2", default=14, type=float) + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=None) - args=parser.parse_args() - - train_be(**vars(args)) + parser.add_argument("--output-path", dest="output_path", required=True) + 
parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=None) + + args = parser.parse_args() - + train_be(**vars(args)) diff --git a/egs/sre18/v1.8k/steps_fe/segments2vad.py b/egs/sre18/v1.8k/steps_fe/segments2vad.py index 016062fa..24262592 100755 --- a/egs/sre18/v1.8k/steps_fe/segments2vad.py +++ b/egs/sre18/v1.8k/steps_fe/segments2vad.py @@ -12,61 +12,66 @@ import numpy as np import pandas as pd -frame_shift=0.01 +frame_shift = 0.01 + def write_vad(f, file_id, vad): - f.write('%s [ ' % (file_id)) + f.write("%s [ " % (file_id)) for i in range(len(vad)): - f.write('%d ' % vad[i]) - f.write(']\n') + f.write("%d " % vad[i]) + f.write("]\n") + - def segments2vad_file(file_id, marks, num_frames, fvad): - tbeg = np.round(np.array(marks.tbeg, ndmin=1)/frame_shift).astype('int') - tend = np.round(np.array(marks.tend, ndmin=1)/frame_shift).astype('int') + tbeg = np.round(np.array(marks.tbeg, ndmin=1) / frame_shift).astype("int") + tend = np.round(np.array(marks.tend, ndmin=1) / frame_shift).astype("int") tend[-1] = min(tend[-1], num_frames) - + vad = np.zeros((num_frames,), dtype=int) for j in range(len(tbeg)): - vad[tbeg[j]:tend[j]+1] = 1 + vad[tbeg[j] : tend[j] + 1] = 1 write_vad(fvad, file_id, vad) - def segments2vad(segments_file, num_frames_file, vad_file): - df_segments = pd.read_csv(segments_file, sep='\s+', header=None, - names=['segments_id','file_id','tbeg','tend']) + df_segments = pd.read_csv( + segments_file, + sep="\s+", + header=None, + names=["segments_id", "file_id", "tbeg", "tend"], + ) df_segments.index = df_segments.file_id - - df_num_frames = pd.read_csv(num_frames_file, sep='\s+', header=None, - names=['file_id','num_frames']) + + df_num_frames = pd.read_csv( + num_frames_file, sep="\s+", header=None, names=["file_id", "num_frames"] + ) df_num_frames.index = df_num_frames.file_id - with open(vad_file, 'w') as fvad: + with open(vad_file, "w") as fvad: for file_id in df_num_frames.file_id: print(file_id) num_frames_i = int(df_num_frames.num_frames.loc[file_id]) if file_id in df_segments.index: df_segments_i = df_segments.loc[file_id] - #print(df_segments_i) + # print(df_segments_i) segments2vad_file(file_id, df_segments_i, num_frames_i, fvad) else: - print('Empty file %s' % file_id) - + print("Empty file %s" % file_id) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Converts Vimal VAD segments to kaldi VAD files') - - parser.add_argument('--segments',dest='segments_file', required=True) - parser.add_argument('--num-frames', dest='num_frames_file', required=True) - parser.add_argument('--vad-file', dest='vad_file', required=True) - args=parser.parse_args() - + fromfile_prefix_chars="@", + description="Converts Vimal VAD segments to kaldi VAD files", + ) + + parser.add_argument("--segments", dest="segments_file", required=True) + parser.add_argument("--num-frames", dest="num_frames_file", required=True) + parser.add_argument("--vad-file", dest="vad_file", required=True) + args = parser.parse_args() + segments2vad(**vars(args)) - diff --git a/egs/sre18/v1.8k/steps_kaldi_diar/make_rttm.py b/egs/sre18/v1.8k/steps_kaldi_diar/make_rttm.py index cc1145ab..ace8f8e6 100755 --- a/egs/sre18/v1.8k/steps_kaldi_diar/make_rttm.py +++ b/egs/sre18/v1.8k/steps_kaldi_diar/make_rttm.py @@ -35,96 +35,112 @@ import argparse import sys -sys.path.append('steps/libs') +sys.path.append("steps/libs") import common as common_lib def get_args(): - 
parser = argparse.ArgumentParser( - description="""This script converts a segments and labels file + parser = argparse.ArgumentParser( + description="""This script converts a segments and labels file to a NIST RTTM file. It handles overlapping segments (e.g. the - output of a sliding-window diarization system).""") + output of a sliding-window diarization system).""" + ) - parser.add_argument("segments", type=str, - help="Input segments file") - parser.add_argument("labels", type=str, - help="Input labels file") - parser.add_argument("rttm_file", type=str, - help="Output RTTM file") - parser.add_argument("--rttm-channel", type=int, default=0, - help="The value passed into the RTTM channel field. \ - Only affects the format of the RTTM file.") + parser.add_argument("segments", type=str, help="Input segments file") + parser.add_argument("labels", type=str, help="Input labels file") + parser.add_argument("rttm_file", type=str, help="Output RTTM file") + parser.add_argument( + "--rttm-channel", + type=int, + default=0, + help="The value passed into the RTTM channel field. \ + Only affects the format of the RTTM file.", + ) + + args = parser.parse_args() + return args - args = parser.parse_args() - return args def main(): - args = get_args() - - # File containing speaker labels per segment - seg2label = {} - with common_lib.smart_open(args.labels) as labels_file: - for line in labels_file: - seg, label = line.strip().split() - seg2label[seg] = label - - # Segments file - reco2segs = {} - with common_lib.smart_open(args.segments) as segments_file: - for line in segments_file: - seg, reco, start, end = line.strip().split() - try: - if reco in reco2segs: - reco2segs[reco] = reco2segs[reco] + " " + start + "," + end + "," + seg2label[seg] - else: - reco2segs[reco] = reco + " " + start + "," + end + "," + seg2label[seg] - except KeyError: - raise RuntimeError("Missing label for segment {0}".format(seg)) - - # Cut up overlapping segments so they are contiguous - contiguous_segs = [] - for reco in sorted(reco2segs): - segs = reco2segs[reco].strip().split() - new_segs = "" - for i in range(1, len(segs)-1): - start, end, label = segs[i].split(',') - next_start, next_end, next_label = segs[i+1].split(',') - if float(end) > float(next_start): - done = False - avg = str((float(next_start) + float(end)) / 2.0) - segs[i+1] = ','.join([avg, next_end, next_label]) - new_segs += " " + start + "," + avg + "," + label - else: + args = get_args() + + # File containing speaker labels per segment + seg2label = {} + with common_lib.smart_open(args.labels) as labels_file: + for line in labels_file: + seg, label = line.strip().split() + seg2label[seg] = label + + # Segments file + reco2segs = {} + with common_lib.smart_open(args.segments) as segments_file: + for line in segments_file: + seg, reco, start, end = line.strip().split() + try: + if reco in reco2segs: + reco2segs[reco] = ( + reco2segs[reco] + " " + start + "," + end + "," + seg2label[seg] + ) + else: + reco2segs[reco] = ( + reco + " " + start + "," + end + "," + seg2label[seg] + ) + except KeyError: + raise RuntimeError("Missing label for segment {0}".format(seg)) + + # Cut up overlapping segments so they are contiguous + contiguous_segs = [] + for reco in sorted(reco2segs): + segs = reco2segs[reco].strip().split() + new_segs = "" + for i in range(1, len(segs) - 1): + start, end, label = segs[i].split(",") + next_start, next_end, next_label = segs[i + 1].split(",") + if float(end) > float(next_start): + done = False + avg = str((float(next_start) + 
float(end)) / 2.0) + segs[i + 1] = ",".join([avg, next_end, next_label]) + new_segs += " " + start + "," + avg + "," + label + else: + new_segs += " " + start + "," + end + "," + label + start, end, label = segs[-1].split(",") new_segs += " " + start + "," + end + "," + label - start, end, label = segs[-1].split(',') - new_segs += " " + start + "," + end + "," + label - contiguous_segs.append(reco + new_segs) - - # Merge contiguous segments of the same label - merged_segs = [] - for reco_line in contiguous_segs: - segs = reco_line.strip().split() - reco = segs[0] - new_segs = "" - for i in range(1, len(segs)-1): - start, end, label = segs[i].split(',') - next_start, next_end, next_label = segs[i+1].split(',') - if float(end) == float(next_start) and label == next_label: - segs[i+1] = ','.join([start, next_end, next_label]) - else: + contiguous_segs.append(reco + new_segs) + + # Merge contiguous segments of the same label + merged_segs = [] + for reco_line in contiguous_segs: + segs = reco_line.strip().split() + reco = segs[0] + new_segs = "" + for i in range(1, len(segs) - 1): + start, end, label = segs[i].split(",") + next_start, next_end, next_label = segs[i + 1].split(",") + if float(end) == float(next_start) and label == next_label: + segs[i + 1] = ",".join([start, next_end, next_label]) + else: + new_segs += " " + start + "," + end + "," + label + start, end, label = segs[-1].split(",") new_segs += " " + start + "," + end + "," + label - start, end, label = segs[-1].split(',') - new_segs += " " + start + "," + end + "," + label - merged_segs.append(reco + new_segs) - - with common_lib.smart_open(args.rttm_file, 'w') as rttm_writer: - for reco_line in merged_segs: - segs = reco_line.strip().split() - reco = segs[0] - for i in range(1, len(segs)): - start, end, label = segs[i].strip().split(',') - print("SPEAKER {0} {1} {2:7.3f} {3:7.3f} {4} ".format( - reco, args.rttm_channel, float(start), float(end)-float(start), label), file=rttm_writer) - -if __name__ == '__main__': - main() + merged_segs.append(reco + new_segs) + + with common_lib.smart_open(args.rttm_file, "w") as rttm_writer: + for reco_line in merged_segs: + segs = reco_line.strip().split() + reco = segs[0] + for i in range(1, len(segs)): + start, end, label = segs[i].strip().split(",") + print( + "SPEAKER {0} {1} {2:7.3f} {3:7.3f} {4} ".format( + reco, + args.rttm_channel, + float(start), + float(end) - float(start), + label, + ), + file=rttm_writer, + ) + + +if __name__ == "__main__": + main() diff --git a/egs/sre18/v1.8k/steps_kaldi_xvec/allocate_egs.py b/egs/sre18/v1.8k/steps_kaldi_xvec/allocate_egs.py index 72a4572d..e4b58c68 100755 --- a/egs/sre18/v1.8k/steps_kaldi_xvec/allocate_egs.py +++ b/egs/sre18/v1.8k/steps_kaldi_xvec/allocate_egs.py @@ -67,51 +67,97 @@ from __future__ import print_function import re, os, argparse, sys, math, warnings, random + def get_args(): - parser = argparse.ArgumentParser(description="Writes ranges.*, outputs.* and archive_chunk_lengths files " - "in preparation for dumping egs for xvector training.", - epilog="Called by sid/nnet3/xvector/get_egs.sh") - parser.add_argument("--prefix", type=str, default="", - help="Adds a prefix to the output files. 
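For reference, make_rttm.py resolves overlaps between consecutive sliding-window segments by moving the boundary to the midpoint of the overlapping region, then merges adjacent segments that share a speaker label. The boundary rule in isolation:

def resolve_overlap(end, next_start):
    # if the current segment runs past the start of the next one,
    # cut both at the midpoint of the overlap
    if end > next_start:
        mid = (end + next_start) / 2.0
        return mid, mid
    return end, next_start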
This is used to distinguish between the train " - "and diagnostic files.") - parser.add_argument("--num-repeats", type=int, default=10, help="Number of times each speaker repeats within an archive.") - parser.add_argument("--min-frames-per-chunk", type=int, default=50, - help="Minimum number of frames-per-chunk used for any archive") - parser.add_argument("--max-frames-per-chunk", type=int, default=300, - help="Maximum number of frames-per-chunk used for any archive") - parser.add_argument("--randomize-chunk-length", type=str, - help="If true, randomly pick a chunk length in [min-frames-per-chunk, max-frames-per-chunk]." - "If false, the chunk length varies from min-frames-per-chunk to max-frames-per-chunk" - "according to a geometric sequence.", - default="true", choices = ["false", "true"]) - parser.add_argument("--frames-per-iter", type=int, default=1000000, - help="Target number of frames for each archive") - parser.add_argument("--num-archives", type=int, default=-1, - help="Number of archives to write"); - parser.add_argument("--num-jobs", type=int, default=-1, - help="Number of jobs we're going to use to write the archives; the ranges.* " - "and outputs.* files are indexed by job. Must be <= the --num-archives option."); - parser.add_argument("--seed", type=int, default=123, - help="Seed for random number generator") - parser.add_argument("--num-pdfs", type=int, default=-1, - help="Num pdfs") + parser = argparse.ArgumentParser( + description="Writes ranges.*, outputs.* and archive_chunk_lengths files " + "in preparation for dumping egs for xvector training.", + epilog="Called by sid/nnet3/xvector/get_egs.sh", + ) + parser.add_argument( + "--prefix", + type=str, + default="", + help="Adds a prefix to the output files. This is used to distinguish between the train " + "and diagnostic files.", + ) + parser.add_argument( + "--num-repeats", + type=int, + default=10, + help="Number of times each speaker repeats within an archive.", + ) + parser.add_argument( + "--min-frames-per-chunk", + type=int, + default=50, + help="Minimum number of frames-per-chunk used for any archive", + ) + parser.add_argument( + "--max-frames-per-chunk", + type=int, + default=300, + help="Maximum number of frames-per-chunk used for any archive", + ) + parser.add_argument( + "--randomize-chunk-length", + type=str, + help="If true, randomly pick a chunk length in [min-frames-per-chunk, max-frames-per-chunk]." + "If false, the chunk length varies from min-frames-per-chunk to max-frames-per-chunk" + "according to a geometric sequence.", + default="true", + choices=["false", "true"], + ) + parser.add_argument( + "--frames-per-iter", + type=int, + default=1000000, + help="Target number of frames for each archive", + ) + parser.add_argument( + "--num-archives", type=int, default=-1, help="Number of archives to write" + ) + parser.add_argument( + "--num-jobs", + type=int, + default=-1, + help="Number of jobs we're going to use to write the archives; the ranges.* " + "and outputs.* files are indexed by job. 
Must be <= the --num-archives option.", + ) + parser.add_argument( + "--seed", type=int, default=123, help="Seed for random number generator" + ) + parser.add_argument("--num-pdfs", type=int, default=-1, help="Num pdfs") # now the positional arguments - parser.add_argument("--utt2len-filename", type=str, required=True, - help="utt2len file of the features to be used as input (format is: " - " )"); - parser.add_argument("--utt2int-filename", type=str, required=True, - help="utt2int file of the features to be used as input (format is: " - " )"); - parser.add_argument("--egs-dir", type=str, required=True, - help="Name of egs directory, e.g. exp/xvector_a/egs"); - - print(' '.join(sys.argv), file=sys.stderr) + parser.add_argument( + "--utt2len-filename", + type=str, + required=True, + help="utt2len file of the features to be used as input (format is: " + " )", + ) + parser.add_argument( + "--utt2int-filename", + type=str, + required=True, + help="utt2int file of the features to be used as input (format is: " + " )", + ) + parser.add_argument( + "--egs-dir", + type=str, + required=True, + help="Name of egs directory, e.g. exp/xvector_a/egs", + ) + + print(" ".join(sys.argv), file=sys.stderr) print(sys.argv, file=sys.stderr) args = parser.parse_args() args = process_args(args) return args + def process_args(args): if args.num_repeats < 1: raise Exception("--num-repeats should have a minimum value of 1") @@ -131,6 +177,7 @@ def process_args(args): raise Exception("--num-jobs is invalid (must not exceed num-archives)") return args + # Create utt2len def get_utt2len(utt2len_filename): utt2len = {} @@ -148,6 +195,7 @@ def get_utt2len(utt2len_filename): return utt2len # Done utt2len + # Handle utt2int, create spk2utt, spks def get_labels(utt2int_filename): f = open(utt2int_filename, "r") @@ -177,29 +225,37 @@ def get_labels(utt2int_filename): def get_random_utt(spkr, spk2utt, min_length): this_utts = spk2utt[spkr] this_num_utts = len(this_utts) - i = random.randint(0, this_num_utts-1) + i = random.randint(0, this_num_utts - 1) utt = this_utts[i] return utt + def random_chunk_length(min_frames_per_chunk, max_frames_per_chunk): ans = random.randint(min_frames_per_chunk, max_frames_per_chunk) return ans + # This function returns an integer in the range # [min-frames-per-chunk, max-frames-per-chunk] according to a geometric # sequence. For example, suppose min-frames-per-chunk is 50, # max-frames-per-chunk is 200, and args.num_archives is 3. Then the # lengths for archives 0, 1, and 2 will be 50, 100, and 200. -def deterministic_chunk_length(archive_id, num_archives, min_frames_per_chunk, max_frames_per_chunk): - if max_frames_per_chunk == min_frames_per_chunk: - return max_frames_per_chunk - elif num_archives == 1: - return int(max_frames_per_chunk); - else: - return int(math.pow(float(max_frames_per_chunk) / - min_frames_per_chunk, float(archive_id) / - (num_archives-1)) * min_frames_per_chunk + 0.5) - +def deterministic_chunk_length( + archive_id, num_archives, min_frames_per_chunk, max_frames_per_chunk +): + if max_frames_per_chunk == min_frames_per_chunk: + return max_frames_per_chunk + elif num_archives == 1: + return int(max_frames_per_chunk) + else: + return int( + math.pow( + float(max_frames_per_chunk) / min_frames_per_chunk, + float(archive_id) / (num_archives - 1), + ) + * min_frames_per_chunk + + 0.5 + ) # given an utterance length utt_length (in frames) and two desired chunk lengths @@ -229,7 +285,7 @@ def main(): # frames in examples of that archive. 
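For reference, deterministic_chunk_length above interpolates the chunk length geometrically between --min-frames-per-chunk and --max-frames-per-chunk across archives. A standalone version of the same formula:

import math

def chunk_length(archive_id, num_archives, min_len, max_len):
    if max_len == min_len:
        return max_len
    if num_archives == 1:
        return max_len
    ratio = float(max_len) / min_len
    return int(min_len * math.pow(ratio, archive_id / (num_archives - 1)) + 0.5)

# e.g. min_len=50, max_len=200, num_archives=3 gives lengths 50, 100, 200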
archive_chunk_lengths = [] # all_egs contains 2-tuples of the form (utt-id, offset) - all_egs= [] + all_egs = [] prefix = "" if args.prefix != "": @@ -237,18 +293,29 @@ def main(): info_f = open(args.egs_dir + "/temp/" + prefix + "archive_chunk_lengths", "w") if info_f is None: - sys.exit(str("Error opening file {0}/temp/" + prefix + "archive_chunk_lengths").format(args.egs_dir)); + sys.exit( + str( + "Error opening file {0}/temp/" + prefix + "archive_chunk_lengths" + ).format(args.egs_dir) + ) for archive_index in range(args.num_archives): print("Processing archive {0}".format(archive_index + 1)) if args.randomize_chunk_length == "true": # don't constrain the lengths to be the same - length = random_chunk_length(args.min_frames_per_chunk, args.max_frames_per_chunk) + length = random_chunk_length( + args.min_frames_per_chunk, args.max_frames_per_chunk + ) else: - length = deterministic_chunk_length(archive_index, args.num_archives, args.min_frames_per_chunk, args.max_frames_per_chunk); + length = deterministic_chunk_length( + archive_index, + args.num_archives, + args.min_frames_per_chunk, + args.max_frames_per_chunk, + ) print("{0} {1}".format(archive_index + 1, length), file=info_f) archive_chunk_lengths.append(length) this_num_egs = int((args.frames_per_iter / length) + 1) - this_egs = [ ] # A 2-tuple of the form (utt-id, start-frame) + this_egs = [] # A 2-tuple of the form (utt-id, start-frame) spkrs = args.num_repeats * list(spk2utt.keys()) random.shuffle(spkrs) for n in range(this_num_egs): @@ -259,14 +326,16 @@ def main(): utt = get_random_utt(spkr, spk2utt, length) utt_len = utt2len[utt] offset = get_random_offset(utt_len, length) - this_egs.append( (utt, offset) ) + this_egs.append((utt, offset)) all_egs.append(this_egs) info_f.close() # work out how many archives we assign to each job in an equitable way. - num_archives_per_job = [ 0 ] * args.num_jobs + num_archives_per_job = [0] * args.num_jobs for i in range(0, args.num_archives): - num_archives_per_job[i % args.num_jobs] = num_archives_per_job[i % args.num_jobs] + 1 + num_archives_per_job[i % args.num_jobs] = ( + num_archives_per_job[i % args.num_jobs] + 1 + ) pdf2num = {} cur_archive = 0 @@ -278,48 +347,80 @@ def main(): for i in range(0, this_num_archives): this_archives_for_job.append(cur_archive) for (utterance_index, offset) in all_egs[cur_archive]: - this_ranges.append( (utterance_index, i, offset) ) + this_ranges.append((utterance_index, i, offset)) cur_archive = cur_archive + 1 f = open(args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1), "w") if f is None: - sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1)) + sys.exit( + "Error opening file " + + args.egs_dir + + "/temp/" + + prefix + + "ranges." + + str(job + 1) + ) for (utterance_index, i, offset) in sorted(this_ranges): archive_index = this_archives_for_job[i] - print("{0} {1} {2} {3} {4} {5}".format(utterance_index, - i, - archive_index + 1, - offset, - archive_chunk_lengths[archive_index], - utt2spk[utterance_index]), - file=f) + print( + "{0} {1} {2} {3} {4} {5}".format( + utterance_index, + i, + archive_index + 1, + offset, + archive_chunk_lengths[archive_index], + utt2spk[utterance_index], + ), + file=f, + ) if utt2spk[utterance_index] in pdf2num: - pdf2num[utt2spk[utterance_index]] += 1 + pdf2num[utt2spk[utterance_index]] += 1 else: pdf2num[utt2spk[utterance_index]] = 1 f.close() - f = open(args.egs_dir + "/temp/" + prefix + "outputs." 
+ str(job + 1), "w") if f is None: - sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1)) - print( " ".join([ str("{0}/" + prefix + "egs_temp.{1}.ark").format(args.egs_dir, n + 1) for n in this_archives_for_job ]), - file=f) + sys.exit( + "Error opening file " + + args.egs_dir + + "/temp/" + + prefix + + "outputs." + + str(job + 1) + ) + print( + " ".join( + [ + str("{0}/" + prefix + "egs_temp.{1}.ark").format( + args.egs_dir, n + 1 + ) + for n in this_archives_for_job + ] + ), + file=f, + ) f.close() f = open(args.egs_dir + "/" + prefix + "pdf2num", "w") nums = [] for k in range(0, args.num_pdfs): if k in pdf2num: - nums.append(pdf2num[k]) + nums.append(pdf2num[k]) else: - nums.append(0) + nums.append(0) print(" ".join(map(str, nums)), file=f) f.close() - print("allocate_egs.py: finished generating " + prefix + "ranges.* and " + prefix + "outputs.* files") + print( + "allocate_egs.py: finished generating " + + prefix + + "ranges.* and " + + prefix + + "outputs.* files" + ) + if __name__ == "__main__": main() - diff --git a/egs/sre19-av-v/v0.1/local/score_dcf.py b/egs/sre19-av-v/v0.1/local/score_dcf.py index 4fffc9e8..514ebf51 100755 --- a/egs/sre19-av-v/v0.1/local/score_dcf.py +++ b/egs/sre19-av-v/v0.1/local/score_dcf.py @@ -24,48 +24,54 @@ def score_dcf(key_file, score_file, output_path): - logging.info('Load key: %s' % key_file) + logging.info("Load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('Load scores: %s' % score_file) + logging.info("Load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) - priors = np.array([0.001, 0.005, 0.01, 0.05 ]) + priors = np.array([0.001, 0.005, 0.01, 0.05]) min_dcf, act_dcf, eer, _ = fast_eval(tar, non, priors) - + output_dir = os.path.dirname(output_path) if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_file = output_path + '_results' - with open(output_file, 'w') as f: - s = 'EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}'.format( - eer * 100, min_dcf[3], act_dcf[3], - min_dcf[2], act_dcf[2], - min_dcf[1], act_dcf[1], - min_dcf[0], act_dcf[0]) + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + ) f.write(s) logging.info(s) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Computes EER and DCF') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - 
choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - score_dcf(**vars(args)) - + score_dcf(**vars(args)) diff --git a/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py b/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py index 96381825..8087cac2 100755 --- a/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/eval-calibration-v1.py @@ -28,45 +28,45 @@ def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - - logging.info('load scores: %s' % in_score_file) + + logging.info("load scores: %s" % in_score_file) scr = TrialScores.load_txt(in_score_file) scr = scr.align_with_ndx(ndx) - logging.info('load model: %s' % model_file) + logging.info("load model: %s" % model_file) lr = LR.load(model_file) - logging.info('apply calibration') + logging.info("apply calibration") s_cal = lr.predict(scr.scores.ravel()) scr.scores = np.reshape(s_cal, scr.scores.shape) - logging.info('save scores: %s' % out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals linear calibration') + fromfile_prefix_chars="@", + description="Evals linear calibration", + ) - parser.add_argument('--in-score-file', dest='in_score_file', required=True) - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_calibration(**vars(args)) - + eval_calibration(**vars(args)) diff --git a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py index 0ab3ff38..025d11a3 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_be_utils.py @@ -17,14 +17,15 @@ def lnorm(x): - mx = np.sqrt(np.sum(x**2, axis=1, keepdims=True)) + 1e-10 - return x/mx + mx = np.sqrt(np.sum(x ** 2, axis=1, keepdims=True)) + 1e-10 + return x / mx + def cosine_scr(x1, x2): - - #t = LNorm() - #x1 = t.predict(x1) - #x2 = t.predict(x2) + + # t = LNorm() + # x1 = t.predict(x1) + # x2 = t.predict(x2) x1 = lnorm(x1) x2 = lnorm(x2) return np.dot(x1, x2.T) @@ -33,7 +34,7 @@ def cosine_scr(x1, x2): def fill_missing_ref_with_facedet_avg(x_ref, x_e, seg_names): assert len(x_ref) == len(x_e) - #get embed dim + # get embed dim for i in range(len(x_e)): if x_e[i].shape[0] > 0: x_dim = x_e[i].shape[1] @@ -42,10 +43,16 @@ def 
fill_missing_ref_with_facedet_avg(x_ref, x_e, seg_names): for i in range(len(x_ref)): if x_ref[i].shape[0] == 0: if x_e[i].shape[0] > 0: - logging.warning('Empty reference for enroll %s, we put the average of faces in enroll file' % (seg_names[i])) + logging.warning( + "Empty reference for enroll %s, we put the average of faces in enroll file" + % (seg_names[i]) + ) x_ref[i] = np.mean(x_e[i], axis=0, keepdims=True) else: - logging.warning('Empty reference for enroll %s, we use zero vector, no faces were detected in enroll file ' % (seg_names[i])) + logging.warning( + "Empty reference for enroll %s, we use zero vector, no faces were detected in enroll file " + % (seg_names[i]) + ) x_ref[i] = np.zeros((1, x_dim)) return x_ref @@ -53,7 +60,7 @@ def fill_missing_ref_with_facedet_avg(x_ref, x_e, seg_names): def fill_missing_test_with_zero(x_t, seg_names): - #get embed dim + # get embed dim for i in range(len(x_t)): if x_t[i].shape[0] > 0: x_dim = x_t[i].shape[1] @@ -61,19 +68,17 @@ def fill_missing_test_with_zero(x_t, seg_names): for i in range(len(x_t)): if x_t[i].shape[0] == 0: - logging.warning('Empty test %s, we use zero vector ' % (seg_names[i])) + logging.warning("Empty test %s, we use zero vector " % (seg_names[i])) x_t[i] = np.zeros((1, x_dim)) return x_t - - def concat_embed_matrices(x): seg_idx = [] for i in range(len(x)): - seg_idx_i = i*np.ones((x[i].shape[0],), dtype=np.int) + seg_idx_i = i * np.ones((x[i].shape[0],), dtype=np.int) seg_idx.append(seg_idx_i) seg_idx = np.concatenate(tuple(seg_idx)) @@ -84,26 +89,26 @@ def concat_embed_matrices(x): def max_combine_scores_1vsM(scores_in, test_idx): num_test = np.max(test_idx) + 1 - scores_out = np.zeros((scores_in.shape[0],num_test)) + scores_out = np.zeros((scores_in.shape[0], num_test)) for j in range(num_test): idx = test_idx == j scores_j = scores_in[:, idx] scores_j = np.max(scores_j, axis=1) - scores_out[:,j] = scores_j + scores_out[:, j] = scores_j return scores_out - + def max_combine_scores_NvsM(scores, enr_idx, test_idx): max_scores_cols = max_combine_scores_1vsM(scores, test_idx) - max_scores_trans = max_combine_scores_1vsM(max_scores_cols.T, enr_idx) + max_scores_trans = max_combine_scores_1vsM(max_scores_cols.T, enr_idx) return max_scores_trans.T - + def read_cohort(v_file, coh_list): r = DRF.create(v_file) - coh = Utt2Info.load(coh_list, sep=' ') + coh = Utt2Info.load(coh_list, sep=" ") x = r.read(coh.key, squeeze=False) for i in range(len(x)): print(x[i].shape) @@ -131,7 +136,7 @@ def compute_median_per_vid(x): def cluster_embeds_ahc(x, thr): - ahc = AHC(method='average', metric='llr') + ahc = AHC(method="average", metric="llr") x_clusters = [] for i in range(len(x)): x_i = x[i] @@ -141,21 +146,20 @@ def cluster_embeds_ahc(x, thr): scores = cosine_scr(x_i, x_i) ahc.fit(scores) - class_ids = ahc.get_flat_clusters(thr, criterion='threshold') + class_ids = ahc.get_flat_clusters(thr, criterion="threshold") x_dim = x_i.shape[1] num_classes = np.max(class_ids) + 1 - logging.info('AHC file %d from %d -> %d' % (i, x_i.shape[0], num_classes)) + logging.info("AHC file %d from %d -> %d" % (i, x_i.shape[0], num_classes)) x_clusters_i = np.zeros((num_classes, x_dim)) for j in range(num_classes): idx = class_ids == j x_clusters_i[j] = np.mean(x_i[idx], axis=0) - + x_clusters.append(x_clusters_i) return x_clusters - def compute_self_att_embeds(x, a): x_att = [] @@ -165,7 +169,7 @@ def compute_self_att_embeds(x, a): x_att.append(x_i) continue - scores = a*cosine_scr(x_i, x_i) + scores = a * cosine_scr(x_i, x_i) p_att = softmax(scores, 
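For reference, face_be_utils.py scores face embeddings with length-normalized dot products (cosine similarity) and collapses the per-face score columns of a video by max-pooling. A compact numpy sketch of both steps, restructured for clarity rather than copied from the module:

import numpy as np

def cosine_scores(x1, x2, eps=1e-10):
    # L2-normalize each row, then the dot product is the cosine similarity
    x1 = x1 / (np.linalg.norm(x1, axis=1, keepdims=True) + eps)
    x2 = x2 / (np.linalg.norm(x2, axis=1, keepdims=True) + eps)
    return x1 @ x2.T

def max_pool_columns(scores, test_idx):
    # test_idx[j] gives the video that face column j belongs to
    num_test = int(test_idx.max()) + 1
    out = np.full((scores.shape[0], num_test), -np.inf)
    for j, col in zip(test_idx, scores.T):
        out[:, j] = np.maximum(out[:, j], col)
    return out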
axis=1) x_att_i = np.dot(p_att, x_i) x_att.append(x_att_i) @@ -183,11 +187,9 @@ def compute_att_test_embeds(x_e, x_t, a): continue print(x_i.shape) - scores = a*cosine_scr(x_e, x_i) + scores = a * cosine_scr(x_e, x_i) p_att = softmax(scores, axis=1) x_att_i = np.dot(p_att, x_i) x_att.append(x_att_i) return x_att - - diff --git a/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py b/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py index 9903582a..091a4ee1 100644 --- a/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py +++ b/egs/sre19-av-v/v0.1/steps_be/face_video_trial_data_reader.py @@ -20,15 +20,28 @@ from hyperion.utils import TrialNdx, TrialKey from hyperion.transforms import TransformList + class FaceVideoTrialDataReaderV1(object): """ Loads Ndx, enroll file and x-vectors to evaluate PLDA for face reco in videos. """ - def __init__(self, ref_v_file, enr_v_file, test_v_file, ndx_file, enroll_file, test_file, - preproc=None, tlist_sep=' ', - model_idx=1, num_model_parts=1, seg_idx=1, num_seg_parts=1, - eval_set='enroll-test'): + def __init__( + self, + ref_v_file, + enr_v_file, + test_v_file, + ndx_file, + enroll_file, + test_file, + preproc=None, + tlist_sep=" ", + model_idx=1, + num_model_parts=1, + seg_idx=1, + num_seg_parts=1, + eval_set="enroll-test", + ): if ref_v_file is None: self.r_ref = None @@ -58,7 +71,6 @@ def __init__(self, ref_v_file, enr_v_file, test_v_file, ndx_file, enroll_file, t self.enroll = enroll self.ndx = ndx - def read(self): if self.r_ref is None: @@ -70,9 +82,9 @@ def read(self): x_e = None else: x_e = self.r_enr.read(self.enroll.key, squeeze=False) - + x_t = self.r_test.read(self.ndx.seg_set, squeeze=False) - + if self.preproc is not None: if x_ref is not None: x_ref = self.preproc.predict(x_ref) @@ -80,49 +92,87 @@ def read(self): x_e = self.preproc.predict(x_e) x_t = self.preproc.predict(x_t) - return x_ref, x_e, x_t, self.enroll.info, self.ndx, - - + return ( + x_ref, + x_e, + x_t, + self.enroll.info, + self.ndx, + ) @staticmethod def filter_args(prefix=None, **kwargs): if prefix is None: - p = '' + p = "" else: - p = prefix + '_' - valid_args = ('tlist_sep', - 'model_idx','num_model_parts', - 'seg_idx', 'num_seg_parts', - 'eval_set') - return dict((k, kwargs[p+k]) - for k in valid_args if p+k in kwargs) - - + p = prefix + "_" + valid_args = ( + "tlist_sep", + "model_idx", + "num_model_parts", + "seg_idx", + "num_seg_parts", + "eval_set", + ) + return dict((k, kwargs[p + k]) for k in valid_args if p + k in kwargs) + @staticmethod def add_argparse_args(parser, prefix=None): if prefix is None: - p1 = '--' - p2 = '' + p1 = "--" + p2 = "" else: - p1 = '--' + prefix + '-' - p2 = prefix + '_' - parser.add_argument(p1+'tlist-sep', dest=(p2+'tlist_sep'), default=' ', - help=('trial lists field separator')) + p1 = "--" + prefix + "-" + p2 = prefix + "_" + parser.add_argument( + p1 + "tlist-sep", + dest=(p2 + "tlist_sep"), + default=" ", + help=("trial lists field separator"), + ) # parser.add_argument(p1+'v-field', dest=(p2+'v_field'), default='', # help=('dataset field in the data file')) - parser.add_argument(p1+'model-part-idx', dest=(p2+'model_idx'), default=1, type=int, - help=('model part index')) - parser.add_argument(p1+'num-model-parts', dest=(p2+'num_model_parts'), default=1, type=int, - help=('number of parts in which we divide the model' - 'list to run evaluation in parallel')) - parser.add_argument(p1+'seg-part-idx', dest=(p2+'seg_idx'), default=1, type=int, - help=('test part index')) - 
parser.add_argument(p1+'num-seg-parts', dest=(p2+'num_seg_parts'), default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - - parser.add_argument(p1+'eval-set', dest=(p2+'eval_set'), type=str.lower, - default='enroll-test', - choices=['enroll-test','enroll-coh','coh-test','coh-coh'], - help=('evaluation subset')) + parser.add_argument( + p1 + "model-part-idx", + dest=(p2 + "model_idx"), + default=1, + type=int, + help=("model part index"), + ) + parser.add_argument( + p1 + "num-model-parts", + dest=(p2 + "num_model_parts"), + default=1, + type=int, + help=( + "number of parts in which we divide the model" + "list to run evaluation in parallel" + ), + ) + parser.add_argument( + p1 + "seg-part-idx", + dest=(p2 + "seg_idx"), + default=1, + type=int, + help=("test part index"), + ) + parser.add_argument( + p1 + "num-seg-parts", + dest=(p2 + "num_seg_parts"), + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument( + p1 + "eval-set", + dest=(p2 + "eval_set"), + type=str.lower, + default="enroll-test", + choices=["enroll-test", "enroll-coh", "coh-test", "coh-coh"], + help=("evaluation subset"), + ) diff --git a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py index 9f4650ae..35c1a3bc 100755 --- a/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py +++ b/egs/sre19-av-v/v0.1/steps_be/train-calibration-v1.py @@ -27,64 +27,71 @@ def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): - logging.info('load key: %s' % key_file) + logging.info("load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('load scores: %s' % score_file) + logging.info("load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) - logging.info('train calibration') + logging.info("train calibration") x = np.concatenate((tar, non)) - y = np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) - lr = LR(prior=prior, lambda_reg=lambda_reg, bias_scaling=1, solver='liblinear', verbose=verbose) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) lr.fit(x, y) print(lr.A) print(lr.b) - logging.info('save calibration at %s' % model_file) + logging.info("save calibration at %s" % model_file) lr.save(model_file) - logging.info('calibrate scores') + logging.info("calibrate scores") tar_cal = lr.predict(tar) non_cal = lr.predict(non) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - - - + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, 
p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains llr calibration') - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('--lambda-reg', dest='lambda_reg', type=float, - default=1e-5) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Trains llr calibration", + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", dest="lambda_reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_calibration(**vars(args)) - + train_calibration(**vars(args)) diff --git a/egs/sre19-av-v/v0.1/steps_insightface/deploy/benchmark.py b/egs/sre19-av-v/v0.1/steps_insightface/deploy/benchmark.py index 42773b07..fda7a2e2 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/deploy/benchmark.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/deploy/benchmark.py @@ -4,23 +4,29 @@ import numpy as np import datetime -parser = argparse.ArgumentParser(description='face model test') +parser = argparse.ArgumentParser(description="face model test") # general -parser.add_argument('--image-size', default='112,112', help='') -parser.add_argument('--model', default='../models/model-r34-amf/model,0', help='path to load model.') -parser.add_argument('--gpu', default=0, type=int, help='gpu id') -parser.add_argument('--det', default=2, type=int, help='mtcnn option, 2 means using R+O, else using O') -parser.add_argument('--flip', default=0, type=int, help='whether do lr flip aug') -parser.add_argument('--threshold', default=1.24, type=float, help='ver dist threshold') +parser.add_argument("--image-size", default="112,112", help="") +parser.add_argument( + "--model", default="../models/model-r34-amf/model,0", help="path to load model." 
+) +parser.add_argument("--gpu", default=0, type=int, help="gpu id") +parser.add_argument( + "--det", default=2, type=int, help="mtcnn option, 2 means using R+O, else using O" +) +parser.add_argument("--flip", default=0, type=int, help="whether do lr flip aug") +parser.add_argument("--threshold", default=1.24, type=float, help="ver dist threshold") args = parser.parse_args() model = face_embedding.FaceModel(args) -#img = cv2.imread('/raid5data/dplearn/lfw/Jude_Law/Jude_Law_0001.jpg') -img = cv2.imread('/raid5data/dplearn/megaface/facescrubr/112x112/Tom_Hanks/Tom_Hanks_54745.png') +# img = cv2.imread('/raid5data/dplearn/lfw/Jude_Law/Jude_Law_0001.jpg') +img = cv2.imread( + "/raid5data/dplearn/megaface/facescrubr/112x112/Tom_Hanks/Tom_Hanks_54745.png" +) time_now = datetime.datetime.now() for i in range(3000): - f1 = model.get_feature(img) + f1 = model.get_feature(img) time_now2 = datetime.datetime.now() diff = time_now2 - time_now -print(diff.total_seconds()/3000) +print(diff.total_seconds() / 3000) diff --git a/egs/sre19-av-v/v0.1/steps_insightface/deploy/face_embedding.py b/egs/sre19-av-v/v0.1/steps_insightface/deploy/face_embedding.py index ae2dd1c0..bf1e632b 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/deploy/face_embedding.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/deploy/face_embedding.py @@ -16,79 +16,85 @@ from time import sleep from easydict import EasyDict as edict from mtcnn_detector import MtcnnDetector -sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'src', 'common')) + +sys.path.append(os.path.join(os.path.dirname(__file__), "..", "src", "common")) import face_image import face_preprocess def do_flip(data): - for idx in range(data.shape[0]): - data[idx,:,:] = np.fliplr(data[idx,:,:]) - -class FaceModel: - def __init__(self, args): - self.args = args - model = edict() + for idx in range(data.shape[0]): + data[idx, :, :] = np.fliplr(data[idx, :, :]) - self.threshold = args.threshold - self.det_minsize = 50 - self.det_threshold = [0.4,0.6,0.6] - self.det_factor = 0.9 - _vec = args.image_size.split(',') - assert len(_vec)==2 - image_size = (int(_vec[0]), int(_vec[1])) - self.image_size = image_size - _vec = args.model.split(',') - assert len(_vec)==2 - prefix = _vec[0] - epoch = int(_vec[1]) - print('loading',prefix, epoch) - ctx = mx.gpu(args.gpu) - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - all_layers = sym.get_internals() - sym = all_layers['fc1_output'] - model = mx.mod.Module(symbol=sym, context=ctx, label_names = None) - #model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))]) - model.bind(data_shapes=[('data', (1, 3, image_size[0], image_size[1]))]) - model.set_params(arg_params, aux_params) - self.model = model - mtcnn_path = os.path.join(os.path.dirname(__file__), 'mtcnn-model') - detector = MtcnnDetector(model_folder=mtcnn_path, ctx=ctx, num_worker=1, accurate_landmark = True, threshold=[0.0,0.0,0.2]) - self.detector = detector +class FaceModel: + def __init__(self, args): + self.args = args + model = edict() - def get_feature(self, face_img): - #face_img is bgr image - ret = self.detector.detect_face_limited(face_img, det_type = self.args.det) - if ret is None: - return None - bbox, points = ret - if bbox.shape[0]==0: - return None - bbox = bbox[0,0:4] - points = points[0,:].reshape((2,5)).T - #print(bbox) - #print(points) - nimg = face_preprocess.preprocess(face_img, bbox, points, image_size='112,112') - nimg = cv2.cvtColor(nimg, 
cv2.COLOR_BGR2RGB) - aligned = np.transpose(nimg, (2,0,1)) - #print(nimg.shape) - embedding = None - for flipid in [0,1]: - if flipid==1: - if self.args.flip==0: - break - do_flip(aligned) - input_blob = np.expand_dims(aligned, axis=0) - data = mx.nd.array(input_blob) - db = mx.io.DataBatch(data=(data,)) - self.model.forward(db, is_train=False) - _embedding = self.model.get_outputs()[0].asnumpy() - #print(_embedding.shape) - if embedding is None: - embedding = _embedding - else: - embedding += _embedding - embedding = sklearn.preprocessing.normalize(embedding).flatten() - return embedding + self.threshold = args.threshold + self.det_minsize = 50 + self.det_threshold = [0.4, 0.6, 0.6] + self.det_factor = 0.9 + _vec = args.image_size.split(",") + assert len(_vec) == 2 + image_size = (int(_vec[0]), int(_vec[1])) + self.image_size = image_size + _vec = args.model.split(",") + assert len(_vec) == 2 + prefix = _vec[0] + epoch = int(_vec[1]) + print("loading", prefix, epoch) + ctx = mx.gpu(args.gpu) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + all_layers = sym.get_internals() + sym = all_layers["fc1_output"] + model = mx.mod.Module(symbol=sym, context=ctx, label_names=None) + # model.bind(data_shapes=[('data', (args.batch_size, 3, image_size[0], image_size[1]))], label_shapes=[('softmax_label', (args.batch_size,))]) + model.bind(data_shapes=[("data", (1, 3, image_size[0], image_size[1]))]) + model.set_params(arg_params, aux_params) + self.model = model + mtcnn_path = os.path.join(os.path.dirname(__file__), "mtcnn-model") + detector = MtcnnDetector( + model_folder=mtcnn_path, + ctx=ctx, + num_worker=1, + accurate_landmark=True, + threshold=[0.0, 0.0, 0.2], + ) + self.detector = detector + def get_feature(self, face_img): + # face_img is bgr image + ret = self.detector.detect_face_limited(face_img, det_type=self.args.det) + if ret is None: + return None + bbox, points = ret + if bbox.shape[0] == 0: + return None + bbox = bbox[0, 0:4] + points = points[0, :].reshape((2, 5)).T + # print(bbox) + # print(points) + nimg = face_preprocess.preprocess(face_img, bbox, points, image_size="112,112") + nimg = cv2.cvtColor(nimg, cv2.COLOR_BGR2RGB) + aligned = np.transpose(nimg, (2, 0, 1)) + # print(nimg.shape) + embedding = None + for flipid in [0, 1]: + if flipid == 1: + if self.args.flip == 0: + break + do_flip(aligned) + input_blob = np.expand_dims(aligned, axis=0) + data = mx.nd.array(input_blob) + db = mx.io.DataBatch(data=(data,)) + self.model.forward(db, is_train=False) + _embedding = self.model.get_outputs()[0].asnumpy() + # print(_embedding.shape) + if embedding is None: + embedding = _embedding + else: + embedding += _embedding + embedding = sklearn.preprocessing.normalize(embedding).flatten() + return embedding diff --git a/egs/sre19-av-v/v0.1/steps_insightface/deploy/ga_merge.py b/egs/sre19-av-v/v0.1/steps_insightface/deploy/ga_merge.py index d6397f0d..23a6c0b0 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/deploy/ga_merge.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/deploy/ga_merge.py @@ -8,11 +8,11 @@ import numpy as np import mxnet as mx -parser = argparse.ArgumentParser(description='merge age and gender models') +parser = argparse.ArgumentParser(description="merge age and gender models") # general -parser.add_argument('--age-model', default='', help='path to load age model.') -parser.add_argument('--gender-model', default='', help='path to load gender model.') -parser.add_argument('--prefix', default='', help='path to save model.') 
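# --- editor's illustrative sketch (not part of this patch) -----------------
# FaceModel.get_feature() above sums the embedding of the aligned face and,
# when --flip is set, of its horizontal mirror, then L2-normalizes the result.
# A framework-agnostic numpy sketch of that pooling step; `embed_fn` is a
# hypothetical callable standing in for the MXNet forward pass.
import numpy as np


def flip_averaged_embedding(aligned_chw, embed_fn, use_flip=True):
    """aligned_chw: float array of shape (3, H, W), RGB, CHW order."""
    views = [aligned_chw]
    if use_flip:
        views.append(aligned_chw[:, :, ::-1])  # mirror along the width axis
    emb = sum(embed_fn(v[None]) for v in views)  # each view -> (1, D), summed
    emb = emb.flatten()
    return emb / np.linalg.norm(emb)  # unit norm, like sklearn's normalize
# ---------------------------------------------------------------------------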
+parser.add_argument("--age-model", default="", help="path to load age model.") +parser.add_argument("--gender-model", default="", help="path to load gender model.") +parser.add_argument("--prefix", default="", help="path to save model.") args = parser.parse_args() i = 0 @@ -20,33 +20,32 @@ targ = {} taux = {} for model in [args.age_model, args.gender_model]: - _vec = model.split(',') - assert len(_vec)==2 - prefix = _vec[0] - epoch = int(_vec[1]) - print('loading',prefix, epoch) - sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) - if tsym is None: - all_layers = sym.get_internals() - tsym = all_layers['fc1_output'] - if i==0: - prefix = 'age' - else: - prefix = 'gender' - for k,v in arg_params.iteritems(): - if k.startswith(prefix): - print('arg', i, k) - targ[k] = v - for k,v in aux_params.iteritems(): - if k.startswith(prefix): - print('aux', i, k) - taux[k] = v - i+=1 + _vec = model.split(",") + assert len(_vec) == 2 + prefix = _vec[0] + epoch = int(_vec[1]) + print("loading", prefix, epoch) + sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) + if tsym is None: + all_layers = sym.get_internals() + tsym = all_layers["fc1_output"] + if i == 0: + prefix = "age" + else: + prefix = "gender" + for k, v in arg_params.iteritems(): + if k.startswith(prefix): + print("arg", i, k) + targ[k] = v + for k, v in aux_params.iteritems(): + if k.startswith(prefix): + print("aux", i, k) + taux[k] = v + i += 1 dellist = [] -#for k,v in arg_params.iteritems(): +# for k,v in arg_params.iteritems(): # if k.startswith('fc7'): # dellist.append(k) for d in dellist: - del targ[d] + del targ[d] mx.model.save_checkpoint(args.prefix, 0, tsym, targ, taux) - diff --git a/egs/sre19-av-v/v0.1/steps_insightface/deploy/helper.py b/egs/sre19-av-v/v0.1/steps_insightface/deploy/helper.py index b82c4b77..96402629 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/deploy/helper.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/deploy/helper.py @@ -5,7 +5,7 @@ import numpy as np -def nms(boxes, overlap_threshold, mode='Union'): +def nms(boxes, overlap_threshold, mode="Union"): """ non max suppression @@ -55,17 +55,19 @@ def nms(boxes, overlap_threshold, mode='Union'): h = np.maximum(0, yy2 - yy1 + 1) inter = w * h - if mode == 'Min': + if mode == "Min": overlap = inter / np.minimum(area[i], area[idxs[:last]]) else: overlap = inter / (area[i] + area[idxs[:last]] - inter) # delete all indexes from the index list that have - idxs = np.delete(idxs, np.concatenate(([last], - np.where(overlap > overlap_threshold)[0]))) + idxs = np.delete( + idxs, np.concatenate(([last], np.where(overlap > overlap_threshold)[0])) + ) return pick + def adjust_input(in_data): """ adjust the input from (h, w, c) to ( 1, c, h, w) for network input @@ -79,60 +81,65 @@ def adjust_input(in_data): out_data: numpy array of shape (1, c, h, w) reshaped array """ - if in_data.dtype is not np.dtype('float32'): + if in_data.dtype is not np.dtype("float32"): out_data = in_data.astype(np.float32) else: out_data = in_data - out_data = out_data.transpose((2,0,1)) + out_data = out_data.transpose((2, 0, 1)) out_data = np.expand_dims(out_data, 0) - out_data = (out_data - 127.5)*0.0078125 + out_data = (out_data - 127.5) * 0.0078125 return out_data + def generate_bbox(map, reg, scale, threshold): - """ - generate bbox from feature map - Parameters: - ---------- - map: numpy array , n x m x 1 - detect score for each position - reg: numpy array , n x m x 4 - bbox - scale: float number - scale of this detection - threshold: float number - 
detect threshold - Returns: - ------- - bbox array - """ - stride = 2 - cellsize = 12 - - t_index = np.where(map>threshold) - - # find nothing - if t_index[0].size == 0: - return np.array([]) - - dx1, dy1, dx2, dy2 = [reg[0, i, t_index[0], t_index[1]] for i in range(4)] - - reg = np.array([dx1, dy1, dx2, dy2]) - score = map[t_index[0], t_index[1]] - boundingbox = np.vstack([np.round((stride*t_index[1]+1)/scale), - np.round((stride*t_index[0]+1)/scale), - np.round((stride*t_index[1]+1+cellsize)/scale), - np.round((stride*t_index[0]+1+cellsize)/scale), - score, - reg]) - - return boundingbox.T + """ + generate bbox from feature map + Parameters: + ---------- + map: numpy array , n x m x 1 + detect score for each position + reg: numpy array , n x m x 4 + bbox + scale: float number + scale of this detection + threshold: float number + detect threshold + Returns: + ------- + bbox array + """ + stride = 2 + cellsize = 12 + + t_index = np.where(map > threshold) + + # find nothing + if t_index[0].size == 0: + return np.array([]) + + dx1, dy1, dx2, dy2 = [reg[0, i, t_index[0], t_index[1]] for i in range(4)] + + reg = np.array([dx1, dy1, dx2, dy2]) + score = map[t_index[0], t_index[1]] + boundingbox = np.vstack( + [ + np.round((stride * t_index[1] + 1) / scale), + np.round((stride * t_index[0] + 1) / scale), + np.round((stride * t_index[1] + 1 + cellsize) / scale), + np.round((stride * t_index[0] + 1 + cellsize) / scale), + score, + reg, + ] + ) + + return boundingbox.T def detect_first_stage(img, net, scale, threshold): """ run PNet for first stage - + Parameters: ---------- img: numpy array, bgr order @@ -148,21 +155,22 @@ def detect_first_stage(img, net, scale, threshold): height, width, _ = img.shape hs = int(math.ceil(height * scale)) ws = int(math.ceil(width * scale)) - - im_data = cv2.resize(img, (ws,hs)) - + + im_data = cv2.resize(img, (ws, hs)) + # adjust for the network input input_buf = adjust_input(im_data) output = net.predict(input_buf) - boxes = generate_bbox(output[1][0,1,:,:], output[0], scale, threshold) + boxes = generate_bbox(output[1][0, 1, :, :], output[0], scale, threshold) if boxes.size == 0: return None # nms - pick = nms(boxes[:,0:5], 0.5, mode='Union') + pick = nms(boxes[:, 0:5], 0.5, mode="Union") boxes = boxes[pick] return boxes -def detect_first_stage_warpper( args ): + +def detect_first_stage_warpper(args): return detect_first_stage(*args) diff --git a/egs/sre19-av-v/v0.1/steps_insightface/deploy/model_slim.py b/egs/sre19-av-v/v0.1/steps_insightface/deploy/model_slim.py index 327adde4..a33bd456 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/deploy/model_slim.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/deploy/model_slim.py @@ -8,24 +8,25 @@ import numpy as np import mxnet as mx -parser = argparse.ArgumentParser(description='face model slim') +parser = argparse.ArgumentParser(description="face model slim") # general -parser.add_argument('--model', default='../models/model-r34-amf/model,60', help='path to load model.') +parser.add_argument( + "--model", default="../models/model-r34-amf/model,60", help="path to load model." 
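# --- editor's illustrative sketch (not part of this patch) -----------------
# helper.py above (nms, adjust_input, generate_bbox, detect_first_stage) keeps
# boxes as rows of (x1, y1, x2, y2, score) with the inclusive "+1" pixel
# convention.  Toy check of the overlap that nms(boxes, 0.5, mode="Union")
# would compute; values are arbitrary and nms itself is not imported here.
import numpy as np

boxes = np.array(
    [
        [10.0, 10.0, 50.0, 50.0, 0.90],
        [12.0, 12.0, 52.0, 52.0, 0.80],
        [100.0, 100.0, 140.0, 140.0, 0.70],
    ],
    dtype=np.float32,
)
w = min(boxes[0, 2], boxes[1, 2]) - max(boxes[0, 0], boxes[1, 0]) + 1
h = min(boxes[0, 3], boxes[1, 3]) - max(boxes[0, 1], boxes[1, 1]) + 1
inter = max(0.0, w) * max(0.0, h)
area = (boxes[:, 2] - boxes[:, 0] + 1) * (boxes[:, 3] - boxes[:, 1] + 1)
iou = inter / (area[0] + area[1] - inter)  # "Union" mode, about 0.83
print(iou)
# with overlap_threshold=0.5 the second box is suppressed by the first, and
# the third (disjoint) box survives, so the kept indices are {0, 2}.
# ---------------------------------------------------------------------------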
+) args = parser.parse_args() -_vec = args.model.split(',') -assert len(_vec)==2 +_vec = args.model.split(",") +assert len(_vec) == 2 prefix = _vec[0] epoch = int(_vec[1]) -print('loading',prefix, epoch) +print("loading", prefix, epoch) sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch) all_layers = sym.get_internals() -sym = all_layers['fc1_output'] +sym = all_layers["fc1_output"] dellist = [] -for k,v in arg_params.iteritems(): - if k.startswith('fc7'): - dellist.append(k) +for k, v in arg_params.iteritems(): + if k.startswith("fc7"): + dellist.append(k) for d in dellist: - del arg_params[d] -mx.model.save_checkpoint(prefix+"s", 0, sym, arg_params, aux_params) - + del arg_params[d] +mx.model.save_checkpoint(prefix + "s", 0, sym, arg_params, aux_params) diff --git a/egs/sre19-av-v/v0.1/steps_insightface/deploy/mtcnn_detector.py b/egs/sre19-av-v/v0.1/steps_insightface/deploy/mtcnn_detector.py index c7332a5c..ddc11f55 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/deploy/mtcnn_detector.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/deploy/mtcnn_detector.py @@ -6,6 +6,7 @@ import cv2 from multiprocessing import Pool from itertools import repeat + try: from itertools import izip except ImportError: @@ -13,62 +14,65 @@ from helper import nms, adjust_input, generate_bbox, detect_first_stage_warpper + class MtcnnDetector(object): """ - Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Neural Networks - see https://github.com/kpzhang93/MTCNN_face_detection_alignment - this is a mxnet version + Joint Face Detection and Alignment using Multi-task Cascaded Convolutional Neural Networks + see https://github.com/kpzhang93/MTCNN_face_detection_alignment + this is a mxnet version """ - def __init__(self, - model_folder='.', - minsize = 20, - threshold = [0.6, 0.7, 0.8], - factor = 0.709, - num_worker = 1, - accurate_landmark = False, - ctx=mx.cpu()): + + def __init__( + self, + model_folder=".", + minsize=20, + threshold=[0.6, 0.7, 0.8], + factor=0.709, + num_worker=1, + accurate_landmark=False, + ctx=mx.cpu(), + ): """ - Initialize the detector - - Parameters: - ---------- - model_folder : string - path for the models - minsize : float number - minimal face to detect - threshold : float number - detect threshold for 3 stages - factor: float number - scale factor for image pyramid - num_worker: int number - number of processes we use for first stage - accurate_landmark: bool - use accurate landmark localization or not + Initialize the detector + + Parameters: + ---------- + model_folder : string + path for the models + minsize : float number + minimal face to detect + threshold : float number + detect threshold for 3 stages + factor: float number + scale factor for image pyramid + num_worker: int number + number of processes we use for first stage + accurate_landmark: bool + use accurate landmark localization or not """ self.num_worker = num_worker self.accurate_landmark = accurate_landmark # load 4 models from folder - models = ['det1', 'det2', 'det3','det4'] - models = [ os.path.join(model_folder, f) for f in models] - + models = ["det1", "det2", "det3", "det4"] + models = [os.path.join(model_folder, f) for f in models] + self.PNets = [] for i in range(num_worker): workner_net = mx.model.FeedForward.load(models[0], 1, ctx=ctx) self.PNets.append(workner_net) - #self.Pool = Pool(num_worker) + # self.Pool = Pool(num_worker) self.RNet = mx.model.FeedForward.load(models[1], 1, ctx=ctx) self.ONet = mx.model.FeedForward.load(models[2], 1, ctx=ctx) self.LNet = 
mx.model.FeedForward.load(models[3], 1, ctx=ctx) - self.minsize = float(minsize) - self.factor = float(factor) + self.minsize = float(minsize) + self.factor = float(factor) self.threshold = threshold - def convert_to_square(self, bbox): """ convert bbox to square @@ -86,9 +90,9 @@ def convert_to_square(self, bbox): h = bbox[:, 3] - bbox[:, 1] + 1 w = bbox[:, 2] - bbox[:, 0] + 1 - max_side = np.maximum(h,w) - square_bbox[:, 0] = bbox[:, 0] + w*0.5 - max_side*0.5 - square_bbox[:, 1] = bbox[:, 1] + h*0.5 - max_side*0.5 + max_side = np.maximum(h, w) + square_bbox[:, 0] = bbox[:, 0] + w * 0.5 - max_side * 0.5 + square_bbox[:, 1] = bbox[:, 1] + h * 0.5 - max_side * 0.5 square_bbox[:, 2] = square_bbox[:, 0] + max_side - 1 square_bbox[:, 3] = square_bbox[:, 1] + max_side - 1 return square_bbox @@ -118,7 +122,6 @@ def calibrate_box(self, bbox, reg): bbox[:, 0:4] = bbox[:, 0:4] + aug return bbox - def pad(self, bboxes, w, h): """ pad the the bboxes, alse restrict the size of it @@ -145,19 +148,19 @@ def pad(self, bboxes, w, h): height and width of the bbox """ - tmpw, tmph = bboxes[:, 2] - bboxes[:, 0] + 1, bboxes[:, 3] - bboxes[:, 1] + 1 + tmpw, tmph = bboxes[:, 2] - bboxes[:, 0] + 1, bboxes[:, 3] - bboxes[:, 1] + 1 num_box = bboxes.shape[0] - dx , dy= np.zeros((num_box, )), np.zeros((num_box, )) - edx, edy = tmpw.copy()-1, tmph.copy()-1 + dx, dy = np.zeros((num_box,)), np.zeros((num_box,)) + edx, edy = tmpw.copy() - 1, tmph.copy() - 1 x, y, ex, ey = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3] - tmp_index = np.where(ex > w-1) + tmp_index = np.where(ex > w - 1) edx[tmp_index] = tmpw[tmp_index] + w - 2 - ex[tmp_index] ex[tmp_index] = w - 1 - tmp_index = np.where(ey > h-1) + tmp_index = np.where(ey > h - 1) edy[tmp_index] = tmph[tmp_index] + h - 2 - ey[tmp_index] ey[tmp_index] = h - 1 @@ -172,7 +175,7 @@ def pad(self, bboxes, w, h): return_list = [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] return_list = [item.astype(np.int32) for item in return_list] - return return_list + return return_list def slice_index(self, number): """ @@ -182,61 +185,75 @@ def slice_index(self, number): number: int number number """ + def chunks(l, n): """Yield successive n-sized chunks from l.""" for i in range(0, len(l), n): - yield l[i:i + n] + yield l[i : i + n] + num_list = range(number) return list(chunks(num_list, self.num_worker)) - + def detect_face_limited(self, img, det_type=2): height, width, _ = img.shape - if det_type>=2: - total_boxes = np.array( [ [0.0, 0.0, img.shape[1], img.shape[0], 0.9] ] ,dtype=np.float32) - num_box = total_boxes.shape[0] - - # pad the bbox - [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(total_boxes, width, height) - # (3, 24, 24) is the input shape for RNet - input_buf = np.zeros((num_box, 3, 24, 24), dtype=np.float32) - - for i in range(num_box): - tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8) - tmp[dy[i]:edy[i]+1, dx[i]:edx[i]+1, :] = img[y[i]:ey[i]+1, x[i]:ex[i]+1, :] - input_buf[i, :, :, :] = adjust_input(cv2.resize(tmp, (24, 24))) - - output = self.RNet.predict(input_buf) - - # filter the total_boxes with threshold - passed = np.where(output[1][:, 1] > self.threshold[1]) - total_boxes = total_boxes[passed] - - if total_boxes.size == 0: - return None - - total_boxes[:, 4] = output[1][passed, 1].reshape((-1,)) - reg = output[0][passed] - - # nms - pick = nms(total_boxes, 0.7, 'Union') - total_boxes = total_boxes[pick] - total_boxes = self.calibrate_box(total_boxes, reg[pick]) - total_boxes = self.convert_to_square(total_boxes) - total_boxes[:, 0:4] = 
np.round(total_boxes[:, 0:4]) + if det_type >= 2: + total_boxes = np.array( + [[0.0, 0.0, img.shape[1], img.shape[0], 0.9]], dtype=np.float32 + ) + num_box = total_boxes.shape[0] + + # pad the bbox + [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad( + total_boxes, width, height + ) + # (3, 24, 24) is the input shape for RNet + input_buf = np.zeros((num_box, 3, 24, 24), dtype=np.float32) + + for i in range(num_box): + tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8) + tmp[dy[i] : edy[i] + 1, dx[i] : edx[i] + 1, :] = img[ + y[i] : ey[i] + 1, x[i] : ex[i] + 1, : + ] + input_buf[i, :, :, :] = adjust_input(cv2.resize(tmp, (24, 24))) + + output = self.RNet.predict(input_buf) + + # filter the total_boxes with threshold + passed = np.where(output[1][:, 1] > self.threshold[1]) + total_boxes = total_boxes[passed] + + if total_boxes.size == 0: + return None + + total_boxes[:, 4] = output[1][passed, 1].reshape((-1,)) + reg = output[0][passed] + + # nms + pick = nms(total_boxes, 0.7, "Union") + total_boxes = total_boxes[pick] + total_boxes = self.calibrate_box(total_boxes, reg[pick]) + total_boxes = self.convert_to_square(total_boxes) + total_boxes[:, 0:4] = np.round(total_boxes[:, 0:4]) else: - total_boxes = np.array( [ [0.0, 0.0, img.shape[1], img.shape[0], 0.9] ] ,dtype=np.float32) + total_boxes = np.array( + [[0.0, 0.0, img.shape[1], img.shape[0], 0.9]], dtype=np.float32 + ) num_box = total_boxes.shape[0] - [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(total_boxes, width, height) + [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad( + total_boxes, width, height + ) # (3, 48, 48) is the input shape for ONet input_buf = np.zeros((num_box, 3, 48, 48), dtype=np.float32) for i in range(num_box): tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.float32) - tmp[dy[i]:edy[i]+1, dx[i]:edx[i]+1, :] = img[y[i]:ey[i]+1, x[i]:ex[i]+1, :] + tmp[dy[i] : edy[i] + 1, dx[i] : edx[i] + 1, :] = img[ + y[i] : ey[i] + 1, x[i] : ex[i] + 1, : + ] input_buf[i, :, :, :] = adjust_input(cv2.resize(tmp, (48, 48))) output = self.ONet.predict(input_buf) - #print(output[2]) + # print(output[2]) # filter the total_boxes with threshold passed = np.where(output[2][:, 1] > self.threshold[2]) @@ -252,15 +269,21 @@ def detect_face_limited(self, img, det_type=2): # compute landmark points bbw = total_boxes[:, 2] - total_boxes[:, 0] + 1 bbh = total_boxes[:, 3] - total_boxes[:, 1] + 1 - points[:, 0:5] = np.expand_dims(total_boxes[:, 0], 1) + np.expand_dims(bbw, 1) * points[:, 0:5] - points[:, 5:10] = np.expand_dims(total_boxes[:, 1], 1) + np.expand_dims(bbh, 1) * points[:, 5:10] + points[:, 0:5] = ( + np.expand_dims(total_boxes[:, 0], 1) + + np.expand_dims(bbw, 1) * points[:, 0:5] + ) + points[:, 5:10] = ( + np.expand_dims(total_boxes[:, 1], 1) + + np.expand_dims(bbh, 1) * points[:, 5:10] + ) # nms total_boxes = self.calibrate_box(total_boxes, reg) - pick = nms(total_boxes, 0.7, 'Min') + pick = nms(total_boxes, 0.7, "Min") total_boxes = total_boxes[pick] points = points[pick] - + if not self.accurate_landmark: return total_boxes, points @@ -268,23 +291,30 @@ def detect_face_limited(self, img, det_type=2): # extended stage ############################################# num_box = total_boxes.shape[0] - patchw = np.maximum(total_boxes[:, 2]-total_boxes[:, 0]+1, total_boxes[:, 3]-total_boxes[:, 1]+1) - patchw = np.round(patchw*0.25) + patchw = np.maximum( + total_boxes[:, 2] - total_boxes[:, 0] + 1, + total_boxes[:, 3] - total_boxes[:, 1] + 1, + ) + patchw = np.round(patchw * 0.25) # make it even - 
patchw[np.where(np.mod(patchw,2) == 1)] += 1 + patchw[np.where(np.mod(patchw, 2) == 1)] += 1 input_buf = np.zeros((num_box, 15, 24, 24), dtype=np.float32) for i in range(5): - x, y = points[:, i], points[:, i+5] - x, y = np.round(x-0.5*patchw), np.round(y-0.5*patchw) - [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(np.vstack([x, y, x+patchw-1, y+patchw-1]).T, - width, - height) + x, y = points[:, i], points[:, i + 5] + x, y = np.round(x - 0.5 * patchw), np.round(y - 0.5 * patchw) + [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad( + np.vstack([x, y, x + patchw - 1, y + patchw - 1]).T, width, height + ) for j in range(num_box): tmpim = np.zeros((tmpw[j], tmpw[j], 3), dtype=np.float32) - tmpim[dy[j]:edy[j]+1, dx[j]:edx[j]+1, :] = img[y[j]:ey[j]+1, x[j]:ex[j]+1, :] - input_buf[j, i*3:i*3+3, :, :] = adjust_input(cv2.resize(tmpim, (24, 24))) + tmpim[dy[j] : edy[j] + 1, dx[j] : edx[j] + 1, :] = img[ + y[j] : ey[j] + 1, x[j] : ex[j] + 1, : + ] + input_buf[j, i * 3 : i * 3 + 3, :, :] = adjust_input( + cv2.resize(tmpim, (24, 24)) + ) output = self.LNet.predict(input_buf) @@ -293,11 +323,15 @@ def detect_face_limited(self, img, det_type=2): for k in range(5): # do not make a large movement - tmp_index = np.where(np.abs(output[k]-0.5) > 0.35) + tmp_index = np.where(np.abs(output[k] - 0.5) > 0.35) output[k][tmp_index[0]] = 0.5 - pointx[:, k] = np.round(points[:, k] - 0.5*patchw) + output[k][:, 0]*patchw - pointy[:, k] = np.round(points[:, k+5] - 0.5*patchw) + output[k][:, 1]*patchw + pointx[:, k] = ( + np.round(points[:, k] - 0.5 * patchw) + output[k][:, 0] * patchw + ) + pointy[:, k] = ( + np.round(points[:, k + 5] - 0.5 * patchw) + output[k][:, 1] * patchw + ) points = np.hstack([pointx, pointy]) points = points.astype(np.int32) @@ -321,7 +355,7 @@ def detect_face(self, img, det_type=0): # check input height, width, _ = img.shape - if det_type==0: + if det_type == 0: MIN_DET_SIZE = 12 if img is None: @@ -334,66 +368,78 @@ def detect_face(self, img, det_type=0): # detected boxes total_boxes = [] - minl = min( height, width) + minl = min(height, width) # get all the valid scales scales = [] - m = MIN_DET_SIZE/self.minsize + m = MIN_DET_SIZE / self.minsize minl *= m factor_count = 0 while minl > MIN_DET_SIZE: - scales.append(m*self.factor**factor_count) + scales.append(m * self.factor ** factor_count) minl *= self.factor factor_count += 1 ############################################# # first stage ############################################# - #for scale in scales: + # for scale in scales: # return_boxes = self.detect_first_stage(img, scale, 0) # if return_boxes is not None: # total_boxes.append(return_boxes) - + sliced_index = self.slice_index(len(scales)) total_boxes = [] for batch in sliced_index: - #local_boxes = self.Pool.map( detect_first_stage_warpper, \ + # local_boxes = self.Pool.map( detect_first_stage_warpper, \ # izip(repeat(img), self.PNets[:len(batch)], [scales[i] for i in batch], repeat(self.threshold[0])) ) - local_boxes = map( detect_first_stage_warpper, \ - izip(repeat(img), self.PNets[:len(batch)], [scales[i] for i in batch], repeat(self.threshold[0])) ) + local_boxes = map( + detect_first_stage_warpper, + izip( + repeat(img), + self.PNets[: len(batch)], + [scales[i] for i in batch], + repeat(self.threshold[0]), + ), + ) total_boxes.extend(local_boxes) - - # remove the Nones - total_boxes = [ i for i in total_boxes if i is not None] + + # remove the Nones + total_boxes = [i for i in total_boxes if i is not None] if len(total_boxes) == 0: return None - + total_boxes = 
np.vstack(total_boxes) if total_boxes.size == 0: return None # merge the detection from first stage - pick = nms(total_boxes[:, 0:5], 0.7, 'Union') + pick = nms(total_boxes[:, 0:5], 0.7, "Union") total_boxes = total_boxes[pick] bbw = total_boxes[:, 2] - total_boxes[:, 0] + 1 bbh = total_boxes[:, 3] - total_boxes[:, 1] + 1 # refine the bboxes - total_boxes = np.vstack([total_boxes[:, 0]+total_boxes[:, 5] * bbw, - total_boxes[:, 1]+total_boxes[:, 6] * bbh, - total_boxes[:, 2]+total_boxes[:, 7] * bbw, - total_boxes[:, 3]+total_boxes[:, 8] * bbh, - total_boxes[:, 4] - ]) + total_boxes = np.vstack( + [ + total_boxes[:, 0] + total_boxes[:, 5] * bbw, + total_boxes[:, 1] + total_boxes[:, 6] * bbh, + total_boxes[:, 2] + total_boxes[:, 7] * bbw, + total_boxes[:, 3] + total_boxes[:, 8] * bbh, + total_boxes[:, 4], + ] + ) total_boxes = total_boxes.T total_boxes = self.convert_to_square(total_boxes) total_boxes[:, 0:4] = np.round(total_boxes[:, 0:4]) else: - total_boxes = np.array( [ [0.0, 0.0, img.shape[1], img.shape[0], 0.9] ] ,dtype=np.float32) + total_boxes = np.array( + [[0.0, 0.0, img.shape[1], img.shape[0], 0.9]], dtype=np.float32 + ) ############################################# # second stage @@ -401,13 +447,17 @@ def detect_face(self, img, det_type=0): num_box = total_boxes.shape[0] # pad the bbox - [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(total_boxes, width, height) + [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad( + total_boxes, width, height + ) # (3, 24, 24) is the input shape for RNet input_buf = np.zeros((num_box, 3, 24, 24), dtype=np.float32) for i in range(num_box): tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.uint8) - tmp[dy[i]:edy[i]+1, dx[i]:edx[i]+1, :] = img[y[i]:ey[i]+1, x[i]:ex[i]+1, :] + tmp[dy[i] : edy[i] + 1, dx[i] : edx[i] + 1, :] = img[ + y[i] : ey[i] + 1, x[i] : ex[i] + 1, : + ] input_buf[i, :, :, :] = adjust_input(cv2.resize(tmp, (24, 24))) output = self.RNet.predict(input_buf) @@ -423,7 +473,7 @@ def detect_face(self, img, det_type=0): reg = output[0][passed] # nms - pick = nms(total_boxes, 0.7, 'Union') + pick = nms(total_boxes, 0.7, "Union") total_boxes = total_boxes[pick] total_boxes = self.calibrate_box(total_boxes, reg[pick]) total_boxes = self.convert_to_square(total_boxes) @@ -435,13 +485,17 @@ def detect_face(self, img, det_type=0): num_box = total_boxes.shape[0] # pad the bbox - [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(total_boxes, width, height) + [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad( + total_boxes, width, height + ) # (3, 48, 48) is the input shape for ONet input_buf = np.zeros((num_box, 3, 48, 48), dtype=np.float32) for i in range(num_box): tmp = np.zeros((tmph[i], tmpw[i], 3), dtype=np.float32) - tmp[dy[i]:edy[i]+1, dx[i]:edx[i]+1, :] = img[y[i]:ey[i]+1, x[i]:ex[i]+1, :] + tmp[dy[i] : edy[i] + 1, dx[i] : edx[i] + 1, :] = img[ + y[i] : ey[i] + 1, x[i] : ex[i] + 1, : + ] input_buf[i, :, :, :] = adjust_input(cv2.resize(tmp, (48, 48))) output = self.ONet.predict(input_buf) @@ -460,15 +514,21 @@ def detect_face(self, img, det_type=0): # compute landmark points bbw = total_boxes[:, 2] - total_boxes[:, 0] + 1 bbh = total_boxes[:, 3] - total_boxes[:, 1] + 1 - points[:, 0:5] = np.expand_dims(total_boxes[:, 0], 1) + np.expand_dims(bbw, 1) * points[:, 0:5] - points[:, 5:10] = np.expand_dims(total_boxes[:, 1], 1) + np.expand_dims(bbh, 1) * points[:, 5:10] + points[:, 0:5] = ( + np.expand_dims(total_boxes[:, 0], 1) + + np.expand_dims(bbw, 1) * points[:, 0:5] + ) + points[:, 5:10] = ( + 
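# --- editor's illustrative sketch (not part of this patch) -----------------
# detect_face() above builds its image pyramid from m = MIN_DET_SIZE / minsize
# and keeps multiplying by `factor` until the scaled shorter side drops below
# MIN_DET_SIZE (12 px, the PNet input).  Stand-alone reproduction of that loop
# with the MtcnnDetector defaults; the 640x480 frame size is an assumption.
MIN_DET_SIZE, minsize, factor = 12, 20.0, 0.709
height, width = 480, 640
m = MIN_DET_SIZE / minsize
minl = min(height, width) * m
scales, factor_count = [], 0
while minl > MIN_DET_SIZE:
    scales.append(m * factor ** factor_count)
    minl *= factor
    factor_count += 1
print(len(scales), [round(s, 3) for s in scales[:3]])
# -> one PNet pass per scale; the first scales are roughly [0.6, 0.425, 0.302]
# ---------------------------------------------------------------------------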
np.expand_dims(total_boxes[:, 1], 1) + + np.expand_dims(bbh, 1) * points[:, 5:10] + ) # nms total_boxes = self.calibrate_box(total_boxes, reg) - pick = nms(total_boxes, 0.7, 'Min') + pick = nms(total_boxes, 0.7, "Min") total_boxes = total_boxes[pick] points = points[pick] - + if not self.accurate_landmark: return total_boxes, points @@ -476,23 +536,30 @@ def detect_face(self, img, det_type=0): # extended stage ############################################# num_box = total_boxes.shape[0] - patchw = np.maximum(total_boxes[:, 2]-total_boxes[:, 0]+1, total_boxes[:, 3]-total_boxes[:, 1]+1) - patchw = np.round(patchw*0.25) + patchw = np.maximum( + total_boxes[:, 2] - total_boxes[:, 0] + 1, + total_boxes[:, 3] - total_boxes[:, 1] + 1, + ) + patchw = np.round(patchw * 0.25) # make it even - patchw[np.where(np.mod(patchw,2) == 1)] += 1 + patchw[np.where(np.mod(patchw, 2) == 1)] += 1 input_buf = np.zeros((num_box, 15, 24, 24), dtype=np.float32) for i in range(5): - x, y = points[:, i], points[:, i+5] - x, y = np.round(x-0.5*patchw), np.round(y-0.5*patchw) - [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad(np.vstack([x, y, x+patchw-1, y+patchw-1]).T, - width, - height) + x, y = points[:, i], points[:, i + 5] + x, y = np.round(x - 0.5 * patchw), np.round(y - 0.5 * patchw) + [dy, edy, dx, edx, y, ey, x, ex, tmpw, tmph] = self.pad( + np.vstack([x, y, x + patchw - 1, y + patchw - 1]).T, width, height + ) for j in range(num_box): tmpim = np.zeros((tmpw[j], tmpw[j], 3), dtype=np.float32) - tmpim[dy[j]:edy[j]+1, dx[j]:edx[j]+1, :] = img[y[j]:ey[j]+1, x[j]:ex[j]+1, :] - input_buf[j, i*3:i*3+3, :, :] = adjust_input(cv2.resize(tmpim, (24, 24))) + tmpim[dy[j] : edy[j] + 1, dx[j] : edx[j] + 1, :] = img[ + y[j] : ey[j] + 1, x[j] : ex[j] + 1, : + ] + input_buf[j, i * 3 : i * 3 + 3, :, :] = adjust_input( + cv2.resize(tmpim, (24, 24)) + ) output = self.LNet.predict(input_buf) @@ -501,19 +568,21 @@ def detect_face(self, img, det_type=0): for k in range(5): # do not make a large movement - tmp_index = np.where(np.abs(output[k]-0.5) > 0.35) + tmp_index = np.where(np.abs(output[k] - 0.5) > 0.35) output[k][tmp_index[0]] = 0.5 - pointx[:, k] = np.round(points[:, k] - 0.5*patchw) + output[k][:, 0]*patchw - pointy[:, k] = np.round(points[:, k+5] - 0.5*patchw) + output[k][:, 1]*patchw + pointx[:, k] = ( + np.round(points[:, k] - 0.5 * patchw) + output[k][:, 0] * patchw + ) + pointy[:, k] = ( + np.round(points[:, k + 5] - 0.5 * patchw) + output[k][:, 1] * patchw + ) points = np.hstack([pointx, pointy]) points = points.astype(np.int32) return total_boxes, points - - def list2colmatrix(self, pts_list): """ convert list to column matrix @@ -523,7 +592,7 @@ def list2colmatrix(self, pts_list): input list Retures: ------- - colMat: + colMat: """ assert len(pts_list) > 0 @@ -539,8 +608,8 @@ def find_tfrom_between_shapes(self, from_shape, to_shape): find transform between shapes Parameters: ---------- - from_shape: - to_shape: + from_shape: + to_shape: Retures: ------- tran_m: @@ -553,8 +622,8 @@ def find_tfrom_between_shapes(self, from_shape, to_shape): cov = np.matrix([[0.0, 0.0], [0.0, 0.0]]) # compute the mean and cov - from_shape_points = from_shape.reshape(from_shape.shape[0]/2, 2) - to_shape_points = to_shape.reshape(to_shape.shape[0]/2, 2) + from_shape_points = from_shape.reshape(from_shape.shape[0] / 2, 2) + to_shape_points = to_shape.reshape(to_shape.shape[0] / 2, 2) mean_from = from_shape_points.mean(axis=0) mean_to = to_shape_points.mean(axis=0) @@ -563,7 +632,9 @@ def find_tfrom_between_shapes(self, from_shape, 
to_shape): sigma_from += temp_dis * temp_dis temp_dis = np.linalg.norm(to_shape_points[i] - mean_to) sigma_to += temp_dis * temp_dis - cov += (to_shape_points[i].transpose() - mean_to.transpose()) * (from_shape_points[i] - mean_from) + cov += (to_shape_points[i].transpose() - mean_to.transpose()) * ( + from_shape_points[i] - mean_from + ) sigma_from = sigma_from / to_shape_points.shape[0] sigma_to = sigma_to / to_shape_points.shape[0] @@ -601,14 +672,14 @@ def extract_image_chips(self, img, points, desired_size=256, padding=0): Retures: ------- crop_imgs: list, n - cropped and aligned faces + cropped and aligned faces """ crop_imgs = [] for p in points: - shape =[] - for k in range(len(p)/2): + shape = [] + for k in range(len(p) / 2): shape.append(p[k]) - shape.append(p[k+5]) + shape.append(p[k + 5]) if padding > 0: padding = padding @@ -621,11 +692,11 @@ def extract_image_chips(self, img, points, desired_size=256, padding=0): from_points = [] to_points = [] - for i in range(len(shape)/2): + for i in range(len(shape) / 2): x = (padding + mean_face_shape_x[i]) / (2 * padding + 1) * desired_size y = (padding + mean_face_shape_y[i]) / (2 * padding + 1) * desired_size to_points.append([x, y]) - from_points.append([shape[2*i], shape[2*i+1]]) + from_points.append([shape[2 * i], shape[2 * i + 1]]) # convert the points to Mat from_mat = self.list2colmatrix(from_points) @@ -640,7 +711,7 @@ def extract_image_chips(self, img, points, desired_size=256, padding=0): scale = np.linalg.norm(probe_vec) angle = 180.0 / math.pi * math.atan2(probe_vec[1, 0], probe_vec[0, 0]) - from_center = [(shape[0]+shape[2])/2.0, (shape[1]+shape[3])/2.0] + from_center = [(shape[0] + shape[2]) / 2.0, (shape[1] + shape[3]) / 2.0] to_center = [0, 0] to_center[1] = desired_size * 0.4 to_center[0] = desired_size * 0.5 @@ -648,7 +719,9 @@ def extract_image_chips(self, img, points, desired_size=256, padding=0): ex = to_center[0] - from_center[0] ey = to_center[1] - from_center[1] - rot_mat = cv2.getRotationMatrix2D((from_center[0], from_center[1]), -1*angle, scale) + rot_mat = cv2.getRotationMatrix2D( + (from_center[0], from_center[1]), -1 * angle, scale + ) rot_mat[0][2] += ex rot_mat[1][2] += ey @@ -656,4 +729,3 @@ def extract_image_chips(self, img, points, desired_size=256, padding=0): crop_imgs.append(chips) return crop_imgs - diff --git a/egs/sre19-av-v/v0.1/steps_insightface/deploy/test.py b/egs/sre19-av-v/v0.1/steps_insightface/deploy/test.py index 6150a08b..cb14c86b 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/deploy/test.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/deploy/test.py @@ -4,31 +4,38 @@ import sys import numpy as np -parser = argparse.ArgumentParser(description='face model test') +parser = argparse.ArgumentParser(description="face model test") # general -parser.add_argument('--image-size', default='112,112', help='') -parser.add_argument('--model', default='', help='path to load model.') -parser.add_argument('--ga-model', default='', help='path to load model.') -parser.add_argument('--gpu', default=0, type=int, help='gpu id') -parser.add_argument('--det', default=0, type=int, help='mtcnn option, 1 means using R+O, 0 means detect from begining') -parser.add_argument('--flip', default=0, type=int, help='whether do lr flip aug') -parser.add_argument('--threshold', default=1.24, type=float, help='ver dist threshold') +parser.add_argument("--image-size", default="112,112", help="") +parser.add_argument("--model", default="", help="path to load model.") +parser.add_argument("--ga-model", default="", 
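# --- editor's illustrative sketch (not part of this patch) -----------------
# deploy/test.py above scores an embedding pair both ways: squared Euclidean
# distance and inner product.  For L2-normalized embeddings, as get_feature()
# returns, the two are tied by  dist = 2 - 2*sim.  Toy check with random
# vectors (the 512-dim size is an assumption):
import numpy as np

rng = np.random.default_rng(0)
f1 = rng.standard_normal(512)
f2 = rng.standard_normal(512)
f1 /= np.linalg.norm(f1)
f2 /= np.linalg.norm(f2)

dist = np.sum(np.square(f1 - f2))
sim = np.dot(f1, f2)
assert np.isclose(dist, 2.0 - 2.0 * sim)
# ---------------------------------------------------------------------------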
help="path to load model.") +parser.add_argument("--gpu", default=0, type=int, help="gpu id") +parser.add_argument( + "--det", + default=0, + type=int, + help="mtcnn option, 1 means using R+O, 0 means detect from begining", +) +parser.add_argument("--flip", default=0, type=int, help="whether do lr flip aug") +parser.add_argument("--threshold", default=1.24, type=float, help="ver dist threshold") args = parser.parse_args() model = face_model.FaceModel(args) -img = cv2.imread('Tom_Hanks_54745.png') +img = cv2.imread("Tom_Hanks_54745.png") img = model.get_input(img) -#f1 = model.get_feature(img) -#print(f1[0:10]) +# f1 = model.get_feature(img) +# print(f1[0:10]) gender, age = model.get_ga(img) print(gender) print(age) sys.exit(0) -img = cv2.imread('/raid5data/dplearn/megaface/facescrubr/112x112/Tom_Hanks/Tom_Hanks_54733.png') +img = cv2.imread( + "/raid5data/dplearn/megaface/facescrubr/112x112/Tom_Hanks/Tom_Hanks_54733.png" +) f2 = model.get_feature(img) -dist = np.sum(np.square(f1-f2)) +dist = np.sum(np.square(f1 - f2)) print(dist) sim = np.dot(f1, f2.T) print(sim) -#diff = np.subtract(source_feature, target_feature) -#dist = np.sum(np.square(diff),1) +# diff = np.subtract(source_feature, target_feature) +# dist = np.sum(np.square(diff),1) diff --git a/egs/sre19-av-v/v0.1/steps_insightface/extract-face-embed-from-bbox-plus-face-det-v4.py b/egs/sre19-av-v/v0.1/steps_insightface/extract-face-embed-from-bbox-plus-face-det-v4.py index b12a6a53..afd60d33 100755 --- a/egs/sre19-av-v/v0.1/steps_insightface/extract-face-embed-from-bbox-plus-face-det-v4.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/extract-face-embed-from-bbox-plus-face-det-v4.py @@ -272,10 +272,14 @@ def extract_face_embed( parser.add_argument("--faceembed-model-file", required=True) parser.add_argument("--use-gpu", default=False, action="store_true") parser.add_argument( - "--save-facedet-img", default=False, action="store_true", + "--save-facedet-img", + default=False, + action="store_true", ) parser.add_argument( - "--save-facecrop-img", default=False, action="store_true", + "--save-facecrop-img", + default=False, + action="store_true", ) parser.add_argument("--fps", type=int, default=1) parser.add_argument("--time-in-secs", default=False, action="store_true") diff --git a/egs/sre19-av-v/v0.1/steps_insightface/extract-face-embed-from-image.py b/egs/sre19-av-v/v0.1/steps_insightface/extract-face-embed-from-image.py index 6ea22e25..3ad59cc1 100755 --- a/egs/sre19-av-v/v0.1/steps_insightface/extract-face-embed-from-image.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/extract-face-embed-from-image.py @@ -96,7 +96,9 @@ def extract_face_embed( while threshold > 0.01: retina_detect_faces = False logging.info( - "processing file %s frame of shape=%s", key, str(frame.shape), + "processing file %s frame of shape=%s", + key, + str(frame.shape), ) faces, landmarks = detect_faces_in_frame(detector, frame, thresh=threshold) logging.info("file %s dectected %d faces", key, faces.shape[0]) @@ -127,7 +129,9 @@ def extract_face_embed( x = x[valid] faces = faces[valid] logging.info( - "file %s extracted %d face embeds", key, faces.shape[0], + "file %s extracted %d face embeds", + key, + faces.shape[0], ) if save_facecrop_img: diff --git a/egs/sre19-av-v/v0.1/steps_insightface/list_utils.py b/egs/sre19-av-v/v0.1/steps_insightface/list_utils.py index 9a5306e3..81407f58 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/list_utils.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/list_utils.py @@ -14,45 +14,42 @@ def list2ndarray(a): - """Converts python 
string list to string numpy array. - """ + """Converts python string list to string numpy array.""" if isinstance(a, list): return np.asarray(a) assert isinstance(a, np.ndarray) return a - def ismember(a, b): """Replicates MATLAB ismember function. - - Returns: - For or arrays A and B returns an array of the same - size as A containing true where the elements of A are in B and false - otherwise. - - Also returns an array LOC containing the - lowest absolute index in B for each element in A which is a member of - B and 0 if there is no such index. + + Returns: + For or arrays A and B returns an array of the same + size as A containing true where the elements of A are in B and false + otherwise. + + Also returns an array LOC containing the + lowest absolute index in B for each element in A which is a member of + B and 0 if there is no such index. """ bad_idx = np.iinfo(np.int32).min d = {} for i, elt in enumerate(b): if elt not in d: d[elt] = i - loc = np.array([d.get(x, bad_idx) for x in a], dtype='int32') + loc = np.array([d.get(x, bad_idx) for x in a], dtype="int32") f = loc != bad_idx return f, loc - def sort(a, reverse=False, return_index=False): """Sorts a list or numpy array - + Args: a: string list or numpy array to sort. reverse: it True it sorts from high to low, otherwise from low to high. - return_index: It true it returns numpy array with the indices of the + return_index: It true it returns numpy array with the indices of the elements of a in the sorted array. Returns: Sorted numpy array. @@ -62,7 +59,7 @@ def sort(a, reverse=False, return_index=False): idx = np.argsort(a) if reverse: idx = idx[::-1] - if not(isinstance(a, np.ndarray)): + if not (isinstance(a, np.ndarray)): a = np.asarray(a) s_a = a[idx] return s_a, idx @@ -72,16 +69,15 @@ def sort(a, reverse=False, return_index=False): return np.sort(a) - -def intersect(a, b, assume_unique=False, return_index = False): +def intersect(a, b, assume_unique=False, return_index=False): """Computes the interseccion of a and b lists or numpy arrays. - + Args: a: First list to intersect. b: Second list to intersect. - assume_unique: If True, the input arrays are both assumed to be unique, + assume_unique: If True, the input arrays are both assumed to be unique, which can speed up the calculation. Default is False. - return_index: if True, it returns two numpy arrays with: + return_index: if True, it returns two numpy arrays with: - the indeces of the elements of a that are in a and b. - the indeces of the elements of b that are in a and b. @@ -99,7 +95,6 @@ def intersect(a, b, assume_unique=False, return_index = False): return c - def split_list(a, idx, num_parts): """Split a list into several parts and returns one of the parts. @@ -111,44 +106,42 @@ def split_list(a, idx, num_parts): Returns: A sublist of a. """ - if not(isinstance(a, np.ndarray)): + if not (isinstance(a, np.ndarray)): a = np.asarray(a) n = float(len(a)) - idx_1 = int(np.floor((idx-1)*n/num_parts)) - idx_2 = int(np.floor(idx*n/num_parts)) - loc = np.arange(idx_1, idx_2, dtype='int64') + idx_1 = int(np.floor((idx - 1) * n / num_parts)) + idx_2 = int(np.floor(idx * n / num_parts)) + loc = np.arange(idx_1, idx_2, dtype="int64") return a[loc], loc - -def split_list_group_by_key(a, idx, num_parts, key = None): +def split_list_group_by_key(a, idx, num_parts, key=None): """Split a list into several parts and returns one of the parts. It groups the elements of a with the same key into the same part. Args: a: list to split. 
idx: index of the part that we want to get from 1 to num_parts num_parts: number of parts to split the list. - key: List of properties of a, it groups the elements of a with the + key: List of properties of a, it groups the elements of a with the same key into the same part. Returns: A sublist of a. """ - if not(isinstance(a, np.ndarray)): + if not (isinstance(a, np.ndarray)): a = np.asarray(a) if key is None: key = a - _, ids=np.unique(key, return_inverse=True) - n = float(ids.max()+1) - idx_1 = int(np.floor((idx-1)*n/num_parts)) - idx_2 = int(np.floor(idx*n/num_parts)) - loc = np.empty(len(a), dtype='int64') + _, ids = np.unique(key, return_inverse=True) + n = float(ids.max() + 1) + idx_1 = int(np.floor((idx - 1) * n / num_parts)) + idx_2 = int(np.floor(idx * n / num_parts)) + loc = np.empty(len(a), dtype="int64") k = 0 for i in range(idx_1, idx_2): - loc_i = (ids==i).nonzero()[0] - loc[k:k+len(loc_i)] = loc_i + loc_i = (ids == i).nonzero()[0] + loc[k : k + len(loc_i)] = loc_i k += len(loc_i) loc = loc[:k] return a[loc], loc - diff --git a/egs/sre19-av-v/v0.1/steps_insightface/scp_list.py b/egs/sre19-av-v/v0.1/steps_insightface/scp_list.py index b84278a8..26da6d4f 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/scp_list.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/scp_list.py @@ -36,10 +36,8 @@ def __init__(self, key, file_path, offset=None, range_spec=None): self.key_to_index = None self.validate() - def validate(self): - """Validates the attributes of the SCPList object. - """ + """Validates the attributes of the SCPList object.""" self.key = list2ndarray(self.key) self.file_path = list2ndarray(self.file_path) assert len(self.key) == len(self.file_path) @@ -56,30 +54,23 @@ def validate(self): assert len(self.key) == self.range_spec.shape[0] assert self.range_spec.shape[1] == 2 - - def copy(self): """Makes a copy of the object.""" return deepcopy(self) - def __len__(self): """Returns the number of elements in the list.""" return len(self.key) - - + def len(self): """Returns the number of elements in the list.""" return len(self.key) - def _create_dict(self): - """Creates dictionary that returns the position of - a segment in the list. + """Creates dictionary that returns the position of + a segment in the list. """ - self.key_to_index = OrderedDict((k,i) for i, k in enumerate(self.key)) - - + self.key_to_index = OrderedDict((k, i) for i, k in enumerate(self.key)) def get_index(self, key): """Returns the position of key in the list.""" @@ -87,24 +78,21 @@ def get_index(self, key): self._create_dict() return self.key_to_index[key] - - def __contains__(self, key): - """ Returns True if the list contains the key""" + """Returns True if the list contains the key""" if self.key_to_index is None: self._create_dict() return key in self.key_to_index - - + def __getitem__(self, key): - """It allows to acces the data in the list by key or index like in + """It allows to acces the data in the list by key or index like in a ditionary, e.g.: If input is a string key: scp = SCPList(keys, file_paths, offsets, ranges) file_path, offset, range = scp['data1'] If input is an index: key, file_path, offset, range = scp[0] - + Args: key: String key or integer index. 
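# --- editor's illustrative sketch (not part of this patch) -----------------
# list_utils.split_list() above returns the idx-th of num_parts contiguous
# chunks (idx is 1-based) plus the selected positions; the grouped variant
# keeps rows with the same key in the same part.  The index arithmetic,
# reproduced stand-alone on a toy list:
import numpy as np

a = np.asarray(["u%d" % i for i in range(10)])
idx, num_parts, n = 2, 3, float(len(a))
idx_1 = int(np.floor((idx - 1) * n / num_parts))  # -> 3
idx_2 = int(np.floor(idx * n / num_parts))        # -> 6
loc = np.arange(idx_1, idx_2, dtype="int64")
print(a[loc])  # ['u3' 'u4' 'u5'], the same slice split_list(a, 2, 3) returns
# ---------------------------------------------------------------------------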
Returns: @@ -126,16 +114,13 @@ def __getitem__(self, key): else: return self.file_path[index], offset, range_spec - def add_prefix_to_filepath(self, prefix): """Adds a prefix to the file path""" self.file_path = np.array([prefix + p for p in self.file_path]) - - def sort(self): """Sorts the list by key""" - self.key, idx = sort(self.key, return_index=True) + self.key, idx = sort(self.key, return_index=True) self.file_path = self.file_path[idx] if self.offset is not None: self.offset = self.offset[idx] @@ -143,9 +128,7 @@ def sort(self): self.range_spec = self.range_spec[idx] self.key_to_index = None - - - def save(self, file_path, sep=' ', offset_sep=':'): + def save(self, file_path, sep=" ", offset_sep=":"): """Saves script list to text file. Args: @@ -154,74 +137,79 @@ def save(self, file_path, sep=' ', offset_sep=':'): offset_sep: Separator between file_path and offset. """ if self.range_spec is None: - range_spec = ['' for k in self.key] + range_spec = ["" for k in self.key] else: range_spec = [] for r in self.range_spec: if r[0] == 0 and r[1] == 0: - range_spec.append('') + range_spec.append("") elif r[1] == 0: - range_spec.append('[%d:]' % r[0]) + range_spec.append("[%d:]" % r[0]) else: - range_spec.append('[%d:%d]' % (r[0], r[0]+r[1])) - - - with open(file_path, 'w') as f: + range_spec.append("[%d:%d]" % (r[0], r[0] + r[1])) + + with open(file_path, "w") as f: if self.offset is None: for k, p, r in zip(self.key, self.file_path, range_spec): - f.write('%s%s%s%s\n' % (k, sep, p, r)) + f.write("%s%s%s%s\n" % (k, sep, p, r)) else: - for k, p, o, r in zip(self.key, self.file_path, self.offset, range_spec): - f.write('%s%s%s%s%d%s\n' % (k, sep, p, offset_sep, o, r)) - + for k, p, o, r in zip( + self.key, self.file_path, self.offset, range_spec + ): + f.write("%s%s%s%s%d%s\n" % (k, sep, p, offset_sep, o, r)) - @staticmethod def parse_script(script, offset_sep): """Parses the parts of the second field of the scp text file. - + Args: script: Second column of scp file. offset_sep: Separtor between file_path and offset. - + Returns: file_path, offset and range_spec. 
""" - file_range = [f.split('[', 1) for f in script] + file_range = [f.split("[", 1) for f in script] offset = None range_spec = None - + file_offset = [f[0].split(offset_sep, 1) for f in file_range] file_path = [f[0] for f in file_offset] - + if len(file_offset[0]) == 2: - offset = [int(f[1]) if len(f)==2 else -1 for f in file_offset] + offset = [int(f[1]) if len(f) == 2 else -1 for f in file_offset] if -1 in offset: - raise ValueError('Missing data position for %s' % f[0]) + raise ValueError("Missing data position for %s" % f[0]) do_range = False for f in file_range: - if len(f)==2: + if len(f) == 2: do_range = True break if do_range: - range_spec1 = [f[1].rstrip(']').split(':', 1) - if len(f)==2 else None for f in file_range] - range_spec21 = [int(f[0]) if f is not None and f[0].isdecimal() - else 0 for f in range_spec1] - range_spec22 = [int(f[1]) if f is not None and f[1].isdecimal() - else None for f in range_spec1] - range_spec = [[a, b-a] if b is not None else [a, 0] - for a,b in zip(range_spec21, range_spec22)] + range_spec1 = [ + f[1].rstrip("]").split(":", 1) if len(f) == 2 else None + for f in file_range + ] + range_spec21 = [ + int(f[0]) if f is not None and f[0].isdecimal() else 0 + for f in range_spec1 + ] + range_spec22 = [ + int(f[1]) if f is not None and f[1].isdecimal() else None + for f in range_spec1 + ] + range_spec = [ + [a, b - a] if b is not None else [a, 0] + for a, b in zip(range_spec21, range_spec22) + ] range_spec = np.array(range_spec, dtype=np.int64) - + return file_path, offset, range_spec - - @classmethod - def load(cls, file_path, sep=' ', offset_sep=':'): + def load(cls, file_path, sep=" ", offset_sep=":"): """Loads script list from text file. Args: @@ -233,22 +221,20 @@ def load(cls, file_path, sep=' ', offset_sep=':'): SCPList object. """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: fields = [line.rstrip().split(sep, 1) for line in f] key = [f[0] for f in fields] script = [f[1] for f in fields] file_path, offset, range_spec = SCPList.parse_script(script, offset_sep) return cls(key, file_path, offset, range_spec) - - def split(self, idx, num_parts, group_by_key=True): - """ Splits SCPList into num_parts and return part idx. - + """Splits SCPList into num_parts and return part idx. + Args: idx: Part to return from 1 to num_parts. num_parts: Number of parts to split the list. - group_by_key: If True, all the lines with the same key + group_by_key: If True, all the lines with the same key go to the same part. Returns: @@ -265,25 +251,23 @@ def split(self, idx, num_parts, group_by_key=True): offset = self.offset[idx1] if self.range_spec is not None: range_spec = self.range_spec[idx1] - - return SCPList(key, file_path, offset, range_spec) + return SCPList(key, file_path, offset, range_spec) - @classmethod def merge(cls, scp_lists): """Merges several SCPList. - + Args: scp_lists: List of SCPLists - + Returns: SCPList object concatenation the scp_lists. 
""" key_list = [item.key for item in scp_lists] file_list = [item.file_path for item in scp_lists] offset_list = [item.offset for item in scp_lists] - range_list = [item.range_spec for item in scp_lists] + range_list = [item.range_spec for item in scp_lists] key = np.concatenate(tuple(key_list)) file_path = np.concatenate(tuple(file_list)) @@ -300,11 +284,9 @@ def merge(cls, scp_lists): return cls(key, file_path, offset, range_spec) - - def filter(self, filter_key, keep=True): """Removes elements from SCPList ojbect by key - + Args: filter_key: List with the keys of the elements to keep or remove. keep: If True, we keep the elements in filter_key; @@ -321,21 +303,19 @@ def filter(self, filter_key, keep=True): f, _ = ismember(self.key, filter_key) key = self.key[f] file_path = self.file_path[f] - + offset = None range_spec = None if self.offset is not None: offset = self.offset[f] if self.range_spec is not None: range_spec = self.range_spec[f] - - return SCPList(key, file_path, offset, range_spec) + return SCPList(key, file_path, offset, range_spec) - def filter_paths(self, filter_key, keep=True): """Removes elements of SCPList by file_path - + Args: filter_key: List with the file_path of the elements to keep or remove. keep: If True, we keep the elements in filter_key; @@ -345,29 +325,27 @@ def filter_paths(self, filter_key, keep=True): SCPList object. """ - if not keep : + if not keep: filter_key = np.setdiff1d(self.file_path, filter_key) f, _ = ismember(filter_key, self.file_path) - assert(np.all(f)) + assert np.all(f) f, _ = ismember(self.file_path, filter_key) key = self.key[f] file_path = self.file_path[f] - + offset = None range_spec = None if self.offset is not None: offset = self.offset[f] if self.range_spec is not None: range_spec = self.range_spec[f] - - return SCPList(key, file_path, offset, range_spec) + return SCPList(key, file_path, offset, range_spec) - def filter_index(self, index, keep=True): """Removes elements of SCPList by index - + Args: filter_key: List with the index of the elements to keep or remove. keep: If True, we keep the elements in filter_key; @@ -377,9 +355,8 @@ def filter_index(self, index, keep=True): SCPList object. """ - if not keep : - index = np.setdiff1d(np.arange( - len(self.key), dtype=np.int64), index) + if not keep: + index = np.setdiff1d(np.arange(len(self.key), dtype=np.int64), index) key = self.key[index] file_path = self.file_path[index] @@ -389,11 +366,9 @@ def filter_index(self, index, keep=True): offset = self.offset[index] if self.range_spec is not None: range_spec = self.range_spec[index] - + return SCPList(key, file_path, offset, range_spec) - - - + def shuffle(self, seed=1024, rng=None): """Shuffles the elements of the list. 
@@ -408,7 +383,7 @@ def shuffle(self, seed=1024, rng=None): rng = np.random.RandomState(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) - + self.key = self.key[index] self.file_path = self.file_path[index] if self.offset is not None: @@ -419,8 +394,6 @@ def shuffle(self, seed=1024, rng=None): self.key_to_index = None return index - - def __eq__(self, other): """Equal operator""" if self.key.size == 0 and other.key.size == 0: @@ -430,46 +403,47 @@ def __eq__(self, other): eq = eq and (self.file_path.shape == other.file_path.shape) eq = eq and np.all(self.file_path == other.file_path) - if (self.offset is None and other.offset is not None or - self.offset is not None and other.offset is None): + if ( + self.offset is None + and other.offset is not None + or self.offset is not None + and other.offset is None + ): eq = False elif self.offset is not None and other.offset is not None: eq = eq and np.all(self.offset == other.offset) - if (self.range_spec is None and other.range_spec is not None or - self.range_spec is not None and other.range_spec is None): + if ( + self.range_spec is None + and other.range_spec is not None + or self.range_spec is not None + and other.range_spec is None + ): eq = False elif self.range_spec is not None and other.range_spec is not None: eq = eq and np.all(self.range_spec == other.range_spec) - - return eq + return eq - def __ne__(self, other): """Non-equal operator""" return not self.__eq__(other) - - def __cmp__(self, other): """Comparison operator""" if self.__eq__(other): return 0 return 1 - - - # def test(): # key = ['spk1']+['spk2']*2+['spk3']*3+['spk10']*10 # file_path = np.arange(len(key)).astype('U') # file_txt = 'test.txt' - + # scp1 = SCPList(key, file_path) # scp1.sort() - + # scp1.save(file_txt) # scp2 = SCPList.load(file_txt) # assert(scp1 == scp2) @@ -493,7 +467,5 @@ def __cmp__(self, other): # f = np.zeros(len(key), dtype='bool') # f[1:13] = True # scp3 = SCPList(scp1.key[f], scp1.file_path[f]) - + # assert(scp2 == scp3) - - diff --git a/egs/sre19-av-v/v0.1/steps_insightface/src/common/face_image.py b/egs/sre19-av-v/v0.1/steps_insightface/src/common/face_image.py index f69a6de5..d5b74fa8 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/src/common/face_image.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/src/common/face_image.py @@ -1,4 +1,3 @@ - from easydict import EasyDict as edict import os import json @@ -6,264 +5,274 @@ def load_property(data_dir): - prop = edict() - for line in open(os.path.join(data_dir, 'property')): - vec = line.strip().split(',') - assert len(vec)==3 - prop.num_classes = int(vec[0]) - prop.image_size = [int(vec[1]), int(vec[2])] - return prop - + prop = edict() + for line in open(os.path.join(data_dir, "property")): + vec = line.strip().split(",") + assert len(vec) == 3 + prop.num_classes = int(vec[0]) + prop.image_size = [int(vec[1]), int(vec[2])] + return prop def get_dataset_webface(input_dir): - clean_list_file = input_dir+"_clean_list.txt" - ret = [] - for line in open(clean_list_file, 'r'): - vec = line.strip().split() - assert len(vec)==2 - fimage = edict() - fimage.id = vec[0].replace("\\", '/') - fimage.classname = vec[1] - fimage.image_path = os.path.join(input_dir, fimage.id) - ret.append(fimage) - return ret + clean_list_file = input_dir + "_clean_list.txt" + ret = [] + for line in open(clean_list_file, "r"): + vec = line.strip().split() + assert len(vec) == 2 + fimage = edict() + fimage.id = vec[0].replace("\\", "/") + fimage.classname = vec[1] + fimage.image_path = os.path.join(input_dir, 
fimage.id) + ret.append(fimage) + return ret + def get_dataset_celeb(input_dir): - clean_list_file = input_dir+"_clean_list.txt" - ret = [] - dir2label = {} - for line in open(clean_list_file, 'r'): - line = line.strip() - if not line.startswith('./m.'): - continue - line = line[2:] - vec = line.split('/') - assert len(vec)==2 - if vec[0] in dir2label: - label = dir2label[vec[0]] - else: - label = len(dir2label) - dir2label[vec[0]] = label + clean_list_file = input_dir + "_clean_list.txt" + ret = [] + dir2label = {} + for line in open(clean_list_file, "r"): + line = line.strip() + if not line.startswith("./m."): + continue + line = line[2:] + vec = line.split("/") + assert len(vec) == 2 + if vec[0] in dir2label: + label = dir2label[vec[0]] + else: + label = len(dir2label) + dir2label[vec[0]] = label + + fimage = edict() + fimage.id = line + fimage.classname = str(label) + fimage.image_path = os.path.join(input_dir, fimage.id) + ret.append(fimage) + return ret - fimage = edict() - fimage.id = line - fimage.classname = str(label) - fimage.image_path = os.path.join(input_dir, fimage.id) - ret.append(fimage) - return ret def _get_dataset_celeb(input_dir): - list_file = input_dir+"_original_list.txt" - ret = [] - for line in open(list_file, 'r'): - vec = line.strip().split() - assert len(vec)==2 - fimage = edict() - fimage.id = vec[0] - fimage.classname = vec[1] - fimage.image_path = os.path.join(input_dir, fimage.id) - ret.append(fimage) - return ret + list_file = input_dir + "_original_list.txt" + ret = [] + for line in open(list_file, "r"): + vec = line.strip().split() + assert len(vec) == 2 + fimage = edict() + fimage.id = vec[0] + fimage.classname = vec[1] + fimage.image_path = os.path.join(input_dir, fimage.id) + ret.append(fimage) + return ret + def get_dataset_facescrub(input_dir): - ret = [] - label = 0 - person_names = [] - for person_name in os.listdir(input_dir): - person_names.append(person_name) - person_names = sorted(person_names) - for person_name in person_names: - subdir = os.path.join(input_dir, person_name) - if not os.path.isdir(subdir): - continue - for _img in os.listdir(subdir): - fimage = edict() - fimage.id = os.path.join(person_name, _img) - fimage.classname = str(label) - fimage.image_path = os.path.join(subdir, _img) - fimage.landmark = None - fimage.bbox = None - ret.append(fimage) - label += 1 - return ret + ret = [] + label = 0 + person_names = [] + for person_name in os.listdir(input_dir): + person_names.append(person_name) + person_names = sorted(person_names) + for person_name in person_names: + subdir = os.path.join(input_dir, person_name) + if not os.path.isdir(subdir): + continue + for _img in os.listdir(subdir): + fimage = edict() + fimage.id = os.path.join(person_name, _img) + fimage.classname = str(label) + fimage.image_path = os.path.join(subdir, _img) + fimage.landmark = None + fimage.bbox = None + ret.append(fimage) + label += 1 + return ret + def get_dataset_megaface(input_dir): - ret = [] - label = 0 - for prefixdir in os.listdir(input_dir): - _prefixdir = os.path.join(input_dir, prefixdir) - for subdir in os.listdir(_prefixdir): - _subdir = os.path.join(_prefixdir, subdir) - if not os.path.isdir(_subdir): - continue - for img in os.listdir(_subdir): - if not img.endswith('.jpg.jpg') and img.endswith('.jpg'): - fimage = edict() - fimage.id = os.path.join(prefixdir, subdir, img) - fimage.classname = str(label) - fimage.image_path = os.path.join(_subdir, img) - json_file = fimage.image_path+".json" - data = None - fimage.bbox = None - fimage.landmark 
= None - if os.path.exists(json_file): - with open(json_file, 'r') as f: - data = f.read() - data = json.loads(data) - assert data is not None - if 'bounding_box' in data: - fimage.bbox = np.zeros( (4,), dtype=np.float32 ) - bb = data['bounding_box'] - fimage.bbox[0] = bb['x'] - fimage.bbox[1] = bb['y'] - fimage.bbox[2] = bb['x']+bb['width'] - fimage.bbox[3] = bb['y']+bb['height'] - #print('bb') - if 'landmarks' in data: - landmarks = data['landmarks'] - if '1' in landmarks and '0' in landmarks and '2' in landmarks: - fimage.landmark = np.zeros( (3,2), dtype=np.float32 ) - fimage.landmark[0][0] = landmarks['1']['x'] - fimage.landmark[0][1] = landmarks['1']['y'] - fimage.landmark[1][0] = landmarks['0']['x'] - fimage.landmark[1][1] = landmarks['0']['y'] - fimage.landmark[2][0] = landmarks['2']['x'] - fimage.landmark[2][1] = landmarks['2']['y'] - #print('lm') + ret = [] + label = 0 + for prefixdir in os.listdir(input_dir): + _prefixdir = os.path.join(input_dir, prefixdir) + for subdir in os.listdir(_prefixdir): + _subdir = os.path.join(_prefixdir, subdir) + if not os.path.isdir(_subdir): + continue + for img in os.listdir(_subdir): + if not img.endswith(".jpg.jpg") and img.endswith(".jpg"): + fimage = edict() + fimage.id = os.path.join(prefixdir, subdir, img) + fimage.classname = str(label) + fimage.image_path = os.path.join(_subdir, img) + json_file = fimage.image_path + ".json" + data = None + fimage.bbox = None + fimage.landmark = None + if os.path.exists(json_file): + with open(json_file, "r") as f: + data = f.read() + data = json.loads(data) + assert data is not None + if "bounding_box" in data: + fimage.bbox = np.zeros((4,), dtype=np.float32) + bb = data["bounding_box"] + fimage.bbox[0] = bb["x"] + fimage.bbox[1] = bb["y"] + fimage.bbox[2] = bb["x"] + bb["width"] + fimage.bbox[3] = bb["y"] + bb["height"] + # print('bb') + if "landmarks" in data: + landmarks = data["landmarks"] + if ( + "1" in landmarks + and "0" in landmarks + and "2" in landmarks + ): + fimage.landmark = np.zeros((3, 2), dtype=np.float32) + fimage.landmark[0][0] = landmarks["1"]["x"] + fimage.landmark[0][1] = landmarks["1"]["y"] + fimage.landmark[1][0] = landmarks["0"]["x"] + fimage.landmark[1][1] = landmarks["0"]["y"] + fimage.landmark[2][0] = landmarks["2"]["x"] + fimage.landmark[2][1] = landmarks["2"]["y"] + # print('lm') + + ret.append(fimage) + label += 1 + return ret - ret.append(fimage) - label+=1 - return ret def get_dataset_fgnet(input_dir): - ret = [] - label = 0 - for subdir in os.listdir(input_dir): - _subdir = os.path.join(input_dir, subdir) - if not os.path.isdir(_subdir): - continue - for img in os.listdir(_subdir): - if img.endswith('.JPG'): - fimage = edict() - fimage.id = os.path.join(_subdir, img) - fimage.classname = str(label) - fimage.image_path = os.path.join(_subdir, img) - json_file = fimage.image_path+".json" - data = None - fimage.bbox = None - fimage.landmark = None - if os.path.exists(json_file): - with open(json_file, 'r') as f: - data = f.read() - data = json.loads(data) - assert data is not None - if 'bounding_box' in data: - fimage.bbox = np.zeros( (4,), dtype=np.float32 ) - bb = data['bounding_box'] - fimage.bbox[0] = bb['x'] - fimage.bbox[1] = bb['y'] - fimage.bbox[2] = bb['x']+bb['width'] - fimage.bbox[3] = bb['y']+bb['height'] - #print('bb') - if 'landmarks' in data: - landmarks = data['landmarks'] - if '1' in landmarks and '0' in landmarks and '2' in landmarks: - fimage.landmark = np.zeros( (3,2), dtype=np.float32 ) - fimage.landmark[0][0] = landmarks['1']['x'] - 
fimage.landmark[0][1] = landmarks['1']['y'] - fimage.landmark[1][0] = landmarks['0']['x'] - fimage.landmark[1][1] = landmarks['0']['y'] - fimage.landmark[2][0] = landmarks['2']['x'] - fimage.landmark[2][1] = landmarks['2']['y'] - #print('lm') + ret = [] + label = 0 + for subdir in os.listdir(input_dir): + _subdir = os.path.join(input_dir, subdir) + if not os.path.isdir(_subdir): + continue + for img in os.listdir(_subdir): + if img.endswith(".JPG"): + fimage = edict() + fimage.id = os.path.join(_subdir, img) + fimage.classname = str(label) + fimage.image_path = os.path.join(_subdir, img) + json_file = fimage.image_path + ".json" + data = None + fimage.bbox = None + fimage.landmark = None + if os.path.exists(json_file): + with open(json_file, "r") as f: + data = f.read() + data = json.loads(data) + assert data is not None + if "bounding_box" in data: + fimage.bbox = np.zeros((4,), dtype=np.float32) + bb = data["bounding_box"] + fimage.bbox[0] = bb["x"] + fimage.bbox[1] = bb["y"] + fimage.bbox[2] = bb["x"] + bb["width"] + fimage.bbox[3] = bb["y"] + bb["height"] + # print('bb') + if "landmarks" in data: + landmarks = data["landmarks"] + if "1" in landmarks and "0" in landmarks and "2" in landmarks: + fimage.landmark = np.zeros((3, 2), dtype=np.float32) + fimage.landmark[0][0] = landmarks["1"]["x"] + fimage.landmark[0][1] = landmarks["1"]["y"] + fimage.landmark[1][0] = landmarks["0"]["x"] + fimage.landmark[1][1] = landmarks["0"]["y"] + fimage.landmark[2][0] = landmarks["2"]["x"] + fimage.landmark[2][1] = landmarks["2"]["y"] + # print('lm') + + # fimage.landmark = None + ret.append(fimage) + label += 1 + return ret - #fimage.landmark = None - ret.append(fimage) - label+=1 - return ret def get_dataset_ytf(input_dir): - ret = [] - label = 0 - person_names = [] - for person_name in os.listdir(input_dir): - person_names.append(person_name) - person_names = sorted(person_names) - for person_name in person_names: - _subdir = os.path.join(input_dir, person_name) - if not os.path.isdir(_subdir): - continue - for _subdir2 in os.listdir(_subdir): - _subdir2 = os.path.join(_subdir, _subdir2) - if not os.path.isdir(_subdir2): - continue - _ret = [] - for img in os.listdir(_subdir2): + ret = [] + label = 0 + person_names = [] + for person_name in os.listdir(input_dir): + person_names.append(person_name) + person_names = sorted(person_names) + for person_name in person_names: + _subdir = os.path.join(input_dir, person_name) + if not os.path.isdir(_subdir): + continue + for _subdir2 in os.listdir(_subdir): + _subdir2 = os.path.join(_subdir, _subdir2) + if not os.path.isdir(_subdir2): + continue + _ret = [] + for img in os.listdir(_subdir2): + fimage = edict() + fimage.id = os.path.join(_subdir2, img) + fimage.classname = str(label) + fimage.image_path = os.path.join(_subdir2, img) + fimage.bbox = None + fimage.landmark = None + _ret.append(fimage) + ret += _ret + label += 1 + return ret + + +def get_dataset_clfw(input_dir): + ret = [] + label = 0 + for img in os.listdir(input_dir): fimage = edict() - fimage.id = os.path.join(_subdir2, img) - fimage.classname = str(label) - fimage.image_path = os.path.join(_subdir2, img) + fimage.id = img + fimage.classname = str(0) + fimage.image_path = os.path.join(input_dir, img) fimage.bbox = None fimage.landmark = None - _ret.append(fimage) - ret += _ret - label+=1 - return ret - -def get_dataset_clfw(input_dir): - ret = [] - label = 0 - for img in os.listdir(input_dir): - fimage = edict() - fimage.id = img - fimage.classname = str(0) - fimage.image_path = 
os.path.join(input_dir, img) - fimage.bbox = None - fimage.landmark = None - ret.append(fimage) - return ret + ret.append(fimage) + return ret -def get_dataset_common(input_dir, min_images = 1): - ret = [] - label = 0 - person_names = [] - for person_name in os.listdir(input_dir): - person_names.append(person_name) - person_names = sorted(person_names) - for person_name in person_names: - _subdir = os.path.join(input_dir, person_name) - if not os.path.isdir(_subdir): - continue - _ret = [] - for img in os.listdir(_subdir): - fimage = edict() - fimage.id = os.path.join(person_name, img) - fimage.classname = str(label) - fimage.image_path = os.path.join(_subdir, img) - fimage.bbox = None - fimage.landmark = None - _ret.append(fimage) - if len(_ret)>=min_images: - ret += _ret - label+=1 - return ret -def get_dataset(name, input_dir): - if name=='webface' or name=='lfw' or name=='vgg': - return get_dataset_common(input_dir) - if name=='celeb': - return get_dataset_celeb(input_dir) - if name=='facescrub': - return get_dataset_facescrub(input_dir) - if name=='megaface': - return get_dataset_megaface(input_dir) - if name=='fgnet': - return get_dataset_fgnet(input_dir) - if name=='ytf': - return get_dataset_ytf(input_dir) - if name=='clfw': - return get_dataset_clfw(input_dir) - return None +def get_dataset_common(input_dir, min_images=1): + ret = [] + label = 0 + person_names = [] + for person_name in os.listdir(input_dir): + person_names.append(person_name) + person_names = sorted(person_names) + for person_name in person_names: + _subdir = os.path.join(input_dir, person_name) + if not os.path.isdir(_subdir): + continue + _ret = [] + for img in os.listdir(_subdir): + fimage = edict() + fimage.id = os.path.join(person_name, img) + fimage.classname = str(label) + fimage.image_path = os.path.join(_subdir, img) + fimage.bbox = None + fimage.landmark = None + _ret.append(fimage) + if len(_ret) >= min_images: + ret += _ret + label += 1 + return ret +def get_dataset(name, input_dir): + if name == "webface" or name == "lfw" or name == "vgg": + return get_dataset_common(input_dir) + if name == "celeb": + return get_dataset_celeb(input_dir) + if name == "facescrub": + return get_dataset_facescrub(input_dir) + if name == "megaface": + return get_dataset_megaface(input_dir) + if name == "fgnet": + return get_dataset_fgnet(input_dir) + if name == "ytf": + return get_dataset_ytf(input_dir) + if name == "clfw": + return get_dataset_clfw(input_dir) + return None diff --git a/egs/sre19-av-v/v0.1/steps_insightface/src/common/face_preprocess.py b/egs/sre19-av-v/v0.1/steps_insightface/src/common/face_preprocess.py index 0b59828a..00022078 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/src/common/face_preprocess.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/src/common/face_preprocess.py @@ -1,113 +1,112 @@ - import cv2 import numpy as np from skimage import transform as trans + def parse_lst_line(line): - vec = line.strip().split("\t") - assert len(vec)>=3 - aligned = int(vec[0]) - image_path = vec[1] - label = int(vec[2]) - bbox = None - landmark = None - #print(vec) - if len(vec)>3: - bbox = np.zeros( (4,), dtype=np.int32) - for i in xrange(3,7): - bbox[i-3] = int(vec[i]) + vec = line.strip().split("\t") + assert len(vec) >= 3 + aligned = int(vec[0]) + image_path = vec[1] + label = int(vec[2]) + bbox = None landmark = None - if len(vec)>7: - _l = [] - for i in xrange(7,17): - _l.append(float(vec[i])) - landmark = np.array(_l).reshape( (2,5) ).T - #print(aligned) - return image_path, label, bbox, landmark, 
aligned - - + # print(vec) + if len(vec) > 3: + bbox = np.zeros((4,), dtype=np.int32) + for i in xrange(3, 7): + bbox[i - 3] = int(vec[i]) + landmark = None + if len(vec) > 7: + _l = [] + for i in xrange(7, 17): + _l.append(float(vec[i])) + landmark = np.array(_l).reshape((2, 5)).T + # print(aligned) + return image_path, label, bbox, landmark, aligned def read_image(img_path, **kwargs): - mode = kwargs.get('mode', 'rgb') - layout = kwargs.get('layout', 'HWC') - if mode=='gray': - img = cv2.imread(img_path, cv2.CV_LOAD_IMAGE_GRAYSCALE) - else: - img = cv2.imread(img_path, cv2.CV_LOAD_IMAGE_COLOR) - if mode=='rgb': - #print('to rgb') - img = img[...,::-1] - if layout=='CHW': - img = np.transpose(img, (2,0,1)) - return img - - -def preprocess(img, bbox=None, landmark=None, **kwargs): - if isinstance(img, str): - img = read_image(img, **kwargs) - M = None - image_size = [] - str_image_size = kwargs.get('image_size', '') - if len(str_image_size)>0: - image_size = [int(x) for x in str_image_size.split(',')] - if len(image_size)==1: - image_size = [image_size[0], image_size[0]] - assert len(image_size)==2 - assert image_size[0]==112 - assert image_size[0]==112 or image_size[1]==96 - if landmark is not None: - assert len(image_size)==2 - src = np.array([ - [30.2946, 51.6963], - [65.5318, 51.5014], - [48.0252, 71.7366], - [33.5493, 92.3655], - [62.7299, 92.2041] ], dtype=np.float32 ) - if image_size[1]==112: - src[:,0] += 8.0 - dst = landmark.astype(np.float32) - - tform = trans.SimilarityTransform() - tform.estimate(dst, src) - M = tform.params[0:2,:] - #M = cv2.estimateRigidTransform( dst.reshape(1,5,2), src.reshape(1,5,2), False) - - if M is None: - if bbox is None: #use center crop - det = np.zeros(4, dtype=np.int32) - det[0] = int(img.shape[1]*0.0625) - det[1] = int(img.shape[0]*0.0625) - det[2] = img.shape[1] - det[0] - det[3] = img.shape[0] - det[1] + mode = kwargs.get("mode", "rgb") + layout = kwargs.get("layout", "HWC") + if mode == "gray": + img = cv2.imread(img_path, cv2.CV_LOAD_IMAGE_GRAYSCALE) else: - det = bbox - margin = kwargs.get('margin', 44) - bb = np.zeros(4, dtype=np.int32) - bb[0] = np.maximum(det[0]-margin/2, 0) - bb[1] = np.maximum(det[1]-margin/2, 0) - bb[2] = np.minimum(det[2]+margin/2, img.shape[1]) - bb[3] = np.minimum(det[3]+margin/2, img.shape[0]) - ret = img[bb[1]:bb[3],bb[0]:bb[2],:] - if len(image_size)>0: - ret = cv2.resize(ret, (image_size[1], image_size[0])) - return ret - else: #do align using landmark - assert len(image_size)==2 - - #src = src[0:3,:] - #dst = dst[0:3,:] - - - #print(src.shape, dst.shape) - #print(src) - #print(dst) - #print(M) - warped = cv2.warpAffine(img,M,(image_size[1],image_size[0]), borderValue = 0.0) - - #tform3 = trans.ProjectiveTransform() - #tform3.estimate(src, dst) - #warped = trans.warp(img, tform3, output_shape=_shape) - return warped + img = cv2.imread(img_path, cv2.CV_LOAD_IMAGE_COLOR) + if mode == "rgb": + # print('to rgb') + img = img[..., ::-1] + if layout == "CHW": + img = np.transpose(img, (2, 0, 1)) + return img +def preprocess(img, bbox=None, landmark=None, **kwargs): + if isinstance(img, str): + img = read_image(img, **kwargs) + M = None + image_size = [] + str_image_size = kwargs.get("image_size", "") + if len(str_image_size) > 0: + image_size = [int(x) for x in str_image_size.split(",")] + if len(image_size) == 1: + image_size = [image_size[0], image_size[0]] + assert len(image_size) == 2 + assert image_size[0] == 112 + assert image_size[0] == 112 or image_size[1] == 96 + if landmark is not None: + assert 
len(image_size) == 2 + src = np.array( + [ + [30.2946, 51.6963], + [65.5318, 51.5014], + [48.0252, 71.7366], + [33.5493, 92.3655], + [62.7299, 92.2041], + ], + dtype=np.float32, + ) + if image_size[1] == 112: + src[:, 0] += 8.0 + dst = landmark.astype(np.float32) + + tform = trans.SimilarityTransform() + tform.estimate(dst, src) + M = tform.params[0:2, :] + # M = cv2.estimateRigidTransform( dst.reshape(1,5,2), src.reshape(1,5,2), False) + + if M is None: + if bbox is None: # use center crop + det = np.zeros(4, dtype=np.int32) + det[0] = int(img.shape[1] * 0.0625) + det[1] = int(img.shape[0] * 0.0625) + det[2] = img.shape[1] - det[0] + det[3] = img.shape[0] - det[1] + else: + det = bbox + margin = kwargs.get("margin", 44) + bb = np.zeros(4, dtype=np.int32) + bb[0] = np.maximum(det[0] - margin / 2, 0) + bb[1] = np.maximum(det[1] - margin / 2, 0) + bb[2] = np.minimum(det[2] + margin / 2, img.shape[1]) + bb[3] = np.minimum(det[3] + margin / 2, img.shape[0]) + ret = img[bb[1] : bb[3], bb[0] : bb[2], :] + if len(image_size) > 0: + ret = cv2.resize(ret, (image_size[1], image_size[0])) + return ret + else: # do align using landmark + assert len(image_size) == 2 + + # src = src[0:3,:] + # dst = dst[0:3,:] + + # print(src.shape, dst.shape) + # print(src) + # print(dst) + # print(M) + warped = cv2.warpAffine(img, M, (image_size[1], image_size[0]), borderValue=0.0) + + # tform3 = trans.ProjectiveTransform() + # tform3.estimate(src, dst) + # warped = trans.warp(img, tform3, output_shape=_shape) + return warped diff --git a/egs/sre19-av-v/v0.1/steps_insightface/src/common/noise_sgd.py b/egs/sre19-av-v/v0.1/steps_insightface/src/common/noise_sgd.py index 70024ec7..4a306d93 100644 --- a/egs/sre19-av-v/v0.1/steps_insightface/src/common/noise_sgd.py +++ b/egs/sre19-av-v/v0.1/steps_insightface/src/common/noise_sgd.py @@ -1,20 +1,22 @@ import mxnet.optimizer as optimizer from mxnet import ndarray as nd + class NoiseSGD(optimizer.SGD): """Noise SGD. This optimizer accepts the same arguments as :class:`.SGD`. 
""" + def __init__(self, scale, **kwargs): super(NoiseSGD, self).__init__(**kwargs) - print('init noise sgd with', scale) + print("init noise sgd with", scale) self.scale = scale def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) + assert isinstance(weight, NDArray) + assert isinstance(grad, NDArray) self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) @@ -22,7 +24,9 @@ def update(self, index, weight, grad, state): grad = grad * self.rescale_grad if self.clip_gradient is not None: grad = clip(grad, -self.clip_gradient, self.clip_gradient) - noise = nd.random.normal(scale = self.scale, shape = grad.shape, dtype=grad.dtype, ctx = grad.context) + noise = nd.random.normal( + scale=self.scale, shape=grad.shape, dtype=grad.dtype, ctx=grad.context + ) grad += noise if state is not None: @@ -35,4 +39,3 @@ def update(self, index, weight, grad, state): else: assert self.momentum == 0.0 weight[:] += -lr * (grad + wd * weight) - diff --git a/egs/sre19-av-v/v0.2/steps_insightface/extract-face-embed-from-bbox-plus-face-det-v4.py b/egs/sre19-av-v/v0.2/steps_insightface/extract-face-embed-from-bbox-plus-face-det-v4.py index fe417942..92cbb1fa 100755 --- a/egs/sre19-av-v/v0.2/steps_insightface/extract-face-embed-from-bbox-plus-face-det-v4.py +++ b/egs/sre19-av-v/v0.2/steps_insightface/extract-face-embed-from-bbox-plus-face-det-v4.py @@ -272,13 +272,19 @@ def __init__(self): parser.add_argument("--faceembed-model-file", required=True) parser.add_argument("--use-gpu", default=False, action="store_true") parser.add_argument( - "--save-facedet-img", default=False, action="store_true", + "--save-facedet-img", + default=False, + action="store_true", ) parser.add_argument( - "--save-facecrop-img", default=False, action="store_true", + "--save-facecrop-img", + default=False, + action="store_true", ) parser.add_argument( - "--save-facealign-img", default=False, action="store_true", + "--save-facealign-img", + default=False, + action="store_true", ) parser.add_argument("--fps", type=int, default=1) parser.add_argument("--time-in-secs", default=False, action="store_true") diff --git a/egs/sre19-av-v/v0.2/steps_insightface/extract-face-embed-from-image.py b/egs/sre19-av-v/v0.2/steps_insightface/extract-face-embed-from-image.py index 3198ff63..fc2e5ea1 100755 --- a/egs/sre19-av-v/v0.2/steps_insightface/extract-face-embed-from-image.py +++ b/egs/sre19-av-v/v0.2/steps_insightface/extract-face-embed-from-image.py @@ -109,7 +109,9 @@ def __init__(self): threshold = 0.9 while threshold > 0.01: logging.info( - "processing file %s of shape=%s", key, str(frame.shape), + "processing file %s of shape=%s", + key, + str(frame.shape), ) faces, landmarks = detect_faces_in_frame(detector, frame, thresh=threshold) logging.info("file %s detected %d faces", key, faces.shape[0]) @@ -135,7 +137,9 @@ def __init__(self): device=device, ) logging.info( - "file %s extracted %d face embeds", key, faces.shape[0], + "file %s extracted %d face embeds", + key, + faces.shape[0], ) if save_facecrop_img: diff --git a/egs/sre19-cmn2/v1/local/add_to_datadir.py b/egs/sre19-cmn2/v1/local/add_to_datadir.py index ecd9612f..a183e4c8 100755 --- a/egs/sre19-cmn2/v1/local/add_to_datadir.py +++ b/egs/sre19-cmn2/v1/local/add_to_datadir.py @@ -5,23 +5,23 @@ import sys, re from xml.sax.saxutils import unescape -basename=sys.argv[1] +basename = sys.argv[1] outdir = sys.argv[2] if len(sys.argv) > 3: - mer_thresh=float(sys.argv[3]) + mer_thresh = float(sys.argv[3]) else: 
mer_thresh = None # open the output files in append mode -segments_file = open(outdir + '/segments', 'a') -utt2spk_file = open(outdir + '/utt2spk', 'a') -text_file = open(outdir + '/text', 'a') +segments_file = open(outdir + "/segments", "a") +utt2spk_file = open(outdir + "/utt2spk", "a") +text_file = open(outdir + "/text", "a") for line in sys.stdin: - m = re.match(r'\w+speaker(\d+)\w+\s+(.*)', line) - #print line + m = re.match(r"\w+speaker(\d+)\w+\s+(.*)", line) + # print line if m: @@ -31,21 +31,19 @@ start = float(t[0]) end = float(t[1]) mer = float(t[2]) - - s = [unescape(w) for w in t[3:]] - words = ' '.join(s) - segId = '%s_spk-%04d_seg-%07d:%07d' % (basename, spk, start*100, end*100) - spkId = '%s_spk-%04d' % (basename, spk) + s = [unescape(w) for w in t[3:]] + words = " ".join(s) + + segId = "%s_spk-%04d_seg-%07d:%07d" % (basename, spk, start * 100, end * 100) + spkId = "%s_spk-%04d" % (basename, spk) # only add segments where the Matching Error Rate is below the prescribed threshhold if mer_thresh == None or mer <= mer_thresh: - print >> segments_file, '%s %s %.2f %.2f' % (segId, basename, start, end ) - print >> text_file, '%s %s' % (segId, words) - print >> utt2spk_file, '%s %s' % (segId, spkId) + print >> segments_file, "%s %s %.2f %.2f" % (segId, basename, start, end) + print >> text_file, "%s %s" % (segId, words) + print >> utt2spk_file, "%s %s" % (segId, spkId) segments_file.close() utt2spk_file.close() text_file.close() - - diff --git a/egs/sre19-cmn2/v1/local/count-speech-frames.py b/egs/sre19-cmn2/v1/local/count-speech-frames.py index 87cb85ea..60de31da 100755 --- a/egs/sre19-cmn2/v1/local/count-speech-frames.py +++ b/egs/sre19-cmn2/v1/local/count-speech-frames.py @@ -4,9 +4,6 @@ """ - - - import sys import os import argparse @@ -22,25 +19,25 @@ def count_speech_frames(vad_file, list_file, output_file): u2i = Utt2Info.load(list_file) r = DRF.create(vad_file) - with open(output_file, 'w') as f: + with open(output_file, "w") as f: for key in u2i.key: vad = r.read(key) nf = np.sum(vad) - f.write('%s %d\n' % (key, nf)) - + f.write("%s %d\n" % (key, nf)) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE18 video condition') + fromfile_prefix_chars="@", + description="Train Back-end for SRE18 video condition", + ) - parser.add_argument('--vad-file', dest='vad_file', required=True) - parser.add_argument('--list-file', dest='list_file', required=True) - parser.add_argument('--output-file', dest='output_file', required=True) + parser.add_argument("--vad-file", dest="vad_file", required=True) + parser.add_argument("--list-file", dest="list_file", required=True) + parser.add_argument("--output-file", dest="output_file", required=True) - args=parser.parse_args() - - count_speech_frames(**vars(args)) + args = parser.parse_args() + count_speech_frames(**vars(args)) diff --git a/egs/sre19-cmn2/v1/local/error_analysis.py b/egs/sre19-cmn2/v1/local/error_analysis.py index d142e618..c4dbba5a 100755 --- a/egs/sre19-cmn2/v1/local/error_analysis.py +++ b/egs/sre19-cmn2/v1/local/error_analysis.py @@ -5,9 +5,6 @@ """ - - - import sys import os import argparse @@ -24,25 +21,33 @@ def score_dcf(key_file, score_file, output_path): - logging.info('Load key: %s' % key_file) + logging.info("Load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('Load scores: %s' % score_file) + logging.info("Load scores: %s" % 
score_file) scr = TrialScores.load_txt(score_file) scr = scr.align_with_ndx(key) p = 0.05 - thr = - np.log(p/(1-p)) - - tar_enr_scr = np.sum(scr.scores * key.tar, axis=1)/(np.sum(key.tar, axis=1)+1e-5) - tar_tst_scr = np.sum(scr.scores * key.tar, axis=0)/(np.sum(key.tar, axis=0)+1e-5) - non_enr_scr = np.sum(scr.scores * key.non, axis=1)/(np.sum(key.non, axis=1)+1e-5) - non_tst_scr = np.sum(scr.scores * key.non, axis=0)/(np.sum(key.non, axis=0)+1e-5) + thr = -np.log(p / (1 - p)) + + tar_enr_scr = np.sum(scr.scores * key.tar, axis=1) / ( + np.sum(key.tar, axis=1) + 1e-5 + ) + tar_tst_scr = np.sum(scr.scores * key.tar, axis=0) / ( + np.sum(key.tar, axis=0) + 1e-5 + ) + non_enr_scr = np.sum(scr.scores * key.non, axis=1) / ( + np.sum(key.non, axis=1) + 1e-5 + ) + non_tst_scr = np.sum(scr.scores * key.non, axis=0) / ( + np.sum(key.non, axis=0) + 1e-5 + ) tar_enr_err = np.sum(np.logical_and(scr.scores < thr, key.tar), axis=1) tar_tst_err = np.sum(np.logical_and(scr.scores < thr, key.tar), axis=0) non_enr_err = np.sum(np.logical_and(scr.scores > thr, key.non), axis=1) non_tst_err = np.sum(np.logical_and(scr.scores > thr, key.non), axis=0) - + tar_enr_idx = np.argsort(tar_enr_scr) tar_tst_idx = np.argsort(tar_tst_scr) non_enr_idx = np.argsort(non_enr_scr)[::-1] @@ -52,45 +57,53 @@ def score_dcf(key_file, score_file, output_path): if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_file = output_path + '_worse_tar_enr' - with open(output_file, 'w') as f: + output_file = output_path + "_worse_tar_enr" + with open(output_file, "w") as f: for idx in tar_enr_idx: - f.write('%s %f %d\n' % (key.model_set[idx], tar_enr_scr[idx], tar_enr_err[idx])) + f.write( + "%s %f %d\n" % (key.model_set[idx], tar_enr_scr[idx], tar_enr_err[idx]) + ) - output_file = output_path + '_worse_tar_tst' - with open(output_file, 'w') as f: + output_file = output_path + "_worse_tar_tst" + with open(output_file, "w") as f: for idx in tar_tst_idx: - f.write('%s %f %d\n' % (key.seg_set[idx], tar_tst_scr[idx], tar_tst_err[idx])) + f.write( + "%s %f %d\n" % (key.seg_set[idx], tar_tst_scr[idx], tar_tst_err[idx]) + ) - output_file = output_path + '_worse_non_enr' - with open(output_file, 'w') as f: + output_file = output_path + "_worse_non_enr" + with open(output_file, "w") as f: for idx in non_enr_idx: - f.write('%s %f %d\n' % (key.model_set[idx], non_enr_scr[idx], non_enr_err[idx])) + f.write( + "%s %f %d\n" % (key.model_set[idx], non_enr_scr[idx], non_enr_err[idx]) + ) - output_file = output_path + '_worse_non_tst' - with open(output_file, 'w') as f: + output_file = output_path + "_worse_non_tst" + with open(output_file, "w") as f: for idx in non_tst_idx: - f.write('%s %f %d\n' % (key.seg_set[idx], non_tst_scr[idx], non_tst_err[idx])) - + f.write( + "%s %f %d\n" % (key.seg_set[idx], non_tst_scr[idx], non_tst_err[idx]) + ) + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Computes EER and DCF') - - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + 
parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - score_dcf(**vars(args)) - + score_dcf(**vars(args)) diff --git a/egs/sre19-cmn2/v1/local/make_dihard_train.py b/egs/sre19-cmn2/v1/local/make_dihard_train.py index 62e6a4b8..7b2fad0a 100755 --- a/egs/sre19-cmn2/v1/local/make_dihard_train.py +++ b/egs/sre19-cmn2/v1/local/make_dihard_train.py @@ -15,76 +15,109 @@ import numpy as np import pandas as pd + def find_audios(wav_path): command = 'find %s -name "*.flac"' % (wav_path) - wavs = subprocess.check_output(command, shell=True).decode('utf-8').splitlines() - keys = [ os.path.splitext(os.path.basename(wav))[0] for wav in wavs ] - data = {'key': keys, 'file_path': wavs} + wavs = subprocess.check_output(command, shell=True).decode("utf-8").splitlines() + keys = [os.path.splitext(os.path.basename(wav))[0] for wav in wavs] + data = {"key": keys, "file_path": wavs} df_wav = pd.DataFrame(data) return df_wav def rttm_is_sorted_by_tbeg(rttm): - tbeg=rttm['tbeg'].values - file_id=rttm['file_id'].values - return np.all(np.logical_or(tbeg[1:]-tbeg[:-1]>=0, - file_id[1:] != file_id[:-1])) - -def sort_rttm(rttm): - return rttm.sort_values(by=['file_id','tbeg']) + tbeg = rttm["tbeg"].values + file_id = rttm["file_id"].values + return np.all(np.logical_or(tbeg[1:] - tbeg[:-1] >= 0, file_id[1:] != file_id[:-1])) -def read_rttm(rttm_file, uem_file=None, sep=' '): - - rttm = pd.read_csv(rttm_file, sep=sep, header=None, - names=['segment_type','file_id','chnl','tbeg','tdur', - 'ortho','stype','name','conf','slat']) - #remove empty lines: - index = (rttm['tdur']>= 0.025) +def sort_rttm(rttm): + return rttm.sort_values(by=["file_id", "tbeg"]) + + +def read_rttm(rttm_file, uem_file=None, sep=" "): + + rttm = pd.read_csv( + rttm_file, + sep=sep, + header=None, + names=[ + "segment_type", + "file_id", + "chnl", + "tbeg", + "tdur", + "ortho", + "stype", + "name", + "conf", + "slat", + ], + ) + # remove empty lines: + index = rttm["tdur"] >= 0.025 rttm = rttm[index] - rttm['ortho'] = '' - rttm['stype'] = '' + rttm["ortho"] = "" + rttm["stype"] = "" if not rttm_is_sorted_by_tbeg(rttm): - print('RTTM %s not properly sorted, sorting it' % (rttm_file)) + print("RTTM %s not properly sorted, sorting it" % (rttm_file)) rttm = sort_rttm(rttm) - #cross with uem + # cross with uem if uem_file is not None: - uem = pd.read_csv(uem_file, sep=' ', header=None, - names=['file_id','chnl','file_tbeg','file_tend']) - rttm_uem = pd.merge(left=rttm, right=uem, on=['file_id', 'chnl']) - - #fix exceding file duration - index_fix=(rttm_uem['tbeg'] < rttm_uem['file_tend']) & (rttm_uem['tbeg'] + rttm_uem['tdur']> rttm_uem['file_tend']) + uem = pd.read_csv( + uem_file, + sep=" ", + header=None, + names=["file_id", "chnl", "file_tbeg", "file_tend"], + ) + rttm_uem = pd.merge(left=rttm, right=uem, on=["file_id", "chnl"]) + + # fix exceding file duration + index_fix = (rttm_uem["tbeg"] < rttm_uem["file_tend"]) & ( + rttm_uem["tbeg"] + rttm_uem["tdur"] > rttm_uem["file_tend"] + ) if np.sum(index_fix) > 0: - print('fixing %d segments with exceding file duration' % (np.sum(index_fix))) - #print(rttm_uem[index_fix]) - rttm_uem.loc[index_fix, 'tdur'] = rttm_uem[index_fix].file_tend - 
rttm_uem[index_fix].tbeg - - index_keep=(rttm_uem['tbeg'] < rttm_uem['file_tend']) + print( + "fixing %d segments with exceding file duration" % (np.sum(index_fix)) + ) + # print(rttm_uem[index_fix]) + rttm_uem.loc[index_fix, "tdur"] = ( + rttm_uem[index_fix].file_tend - rttm_uem[index_fix].tbeg + ) + + index_keep = rttm_uem["tbeg"] < rttm_uem["file_tend"] n_rm = rttm_uem.shape[0] - np.sum(index_keep) if n_rm > 0: - print('removing %d segments that start after file tend' % (n_rm)) - #print(rttm_uem[~index_keep]) + print("removing %d segments that start after file tend" % (n_rm)) + # print(rttm_uem[~index_keep]) rttm_uem = rttm_uem[index_keep] - index_fix=(rttm_uem['tbeg'] < rttm_uem['file_tbeg']) & (rttm_uem['tbeg'] + rttm_uem['tdur']> rttm_uem['file_tbeg']) + index_fix = (rttm_uem["tbeg"] < rttm_uem["file_tbeg"]) & ( + rttm_uem["tbeg"] + rttm_uem["tdur"] > rttm_uem["file_tbeg"] + ) if np.sum(index_fix) > 0: - print('fixing %d segments that start before file tbeg' % (np.sum(index_fix))) - #print(rttm_uem[index_fix]) - rttm_uem.loc[index_fix, 'tdur'] = rttm_uem[index_fix].tbeg + rttm_uem[index_fix].tdur - rttm_uem[index_fix].file_tbeg - rttm_uem.loc[index_fix, 'tbeg'] = rttm_uem[index_fix].file_tbeg - - index_keep=(rttm_uem['tbeg'] + rttm_uem['tdur'] > rttm_uem['file_tbeg']) + print( + "fixing %d segments that start before file tbeg" % (np.sum(index_fix)) + ) + # print(rttm_uem[index_fix]) + rttm_uem.loc[index_fix, "tdur"] = ( + rttm_uem[index_fix].tbeg + + rttm_uem[index_fix].tdur + - rttm_uem[index_fix].file_tbeg + ) + rttm_uem.loc[index_fix, "tbeg"] = rttm_uem[index_fix].file_tbeg + + index_keep = rttm_uem["tbeg"] + rttm_uem["tdur"] > rttm_uem["file_tbeg"] n_rm = rttm_uem.shape[0] - np.sum(index_keep) if n_rm > 0: - print('removing %d segments that end before tbeg' % (n_rm)) - #print(rttm_uem[~index_keep]) + print("removing %d segments that end before tbeg" % (n_rm)) + # print(rttm_uem[~index_keep]) rttm_uem = rttm_uem[index_keep] - rttm = rttm_uem.drop(columns=['file_tbeg', 'file_tend']) - #print(pd.concat([rttm,rttm2]).drop_duplicates(keep=False).to_string()) + rttm = rttm_uem.drop(columns=["file_tbeg", "file_tend"]) + # print(pd.concat([rttm,rttm2]).drop_duplicates(keep=False).to_string()) return rttm @@ -92,61 +125,68 @@ def read_rttm(rttm_file, uem_file=None, sep=' '): def make_train_segments_from_rttm(df_rttm, min_dur, max_dur): segments = pd.DataFrame() - #vad = pd.DataFrame() + # vad = pd.DataFrame() vad = [] rng = np.random.RandomState(seed=1234) - spk_ids = df_rttm['name'].sort_values().unique() + spk_ids = df_rttm["name"].sort_values().unique() for spk_id in spk_ids: - print('make train segments for spk=%s' % (spk_id)) - index = df_rttm['name'] == spk_id + print("make train segments for spk=%s" % (spk_id)) + index = df_rttm["name"] == spk_id df_rttm_i = df_rttm[index] - file_names = df_rttm_i['file_id'].sort_values().unique() + file_names = df_rttm_i["file_id"].sort_values().unique() for file_name in file_names: - print('\tmake train segments for spk=%s file_name=%s' % (spk_id, file_name)) - index = df_rttm_i['file_id'] == file_name + print("\tmake train segments for spk=%s file_name=%s" % (spk_id, file_name)) + index = df_rttm_i["file_id"] == file_name df_rttm_ij = df_rttm_i[index] - cum_length = np.cumsum(np.asarray(df_rttm_ij['tdur'])) + cum_length = np.cumsum(np.asarray(df_rttm_ij["tdur"])) total_length = cum_length[-1] first_utt = 0 count = 0 - while ( total_length > min_dur ): + while total_length > min_dur: # select number of utterances for this segment cur_dur = 
min(rng.randint(min_dur, max_dur), total_length) # print('\t\t extract segment %d of length %.2f, remaining length %.2f' % (count, cur_dur, total_length-cur_dur)) - last_utt = np.where(cum_length>=cur_dur)[0][0] - tbeg = df_rttm_ij.iloc[first_utt].tbeg-1 + last_utt = np.where(cum_length >= cur_dur)[0][0] + tbeg = df_rttm_ij.iloc[first_utt].tbeg - 1 tbeg = tbeg if tbeg > 0 else 0 - tend = (df_rttm_ij.iloc[last_utt].tbeg + - df_rttm_ij.iloc[last_utt].tdur) - - #make segment - segment_id = '%s-%s-%07d-%07d' % ( - spk_id, file_name, int(tbeg*100), int(tend*100)) - row = {'segment_id': segment_id, 'filename': file_name, 'speaker': spk_id, - 'beginning_time': tbeg, 'end_time': tend } + tend = df_rttm_ij.iloc[last_utt].tbeg + df_rttm_ij.iloc[last_utt].tdur + + # make segment + segment_id = "%s-%s-%07d-%07d" % ( + spk_id, + file_name, + int(tbeg * 100), + int(tend * 100), + ) + row = { + "segment_id": segment_id, + "filename": file_name, + "speaker": spk_id, + "beginning_time": tbeg, + "end_time": tend, + } segments = segments.append(row, ignore_index=True) - #make vad - df_vad = df_rttm_ij.iloc[first_utt:last_utt+1].copy() - df_vad['file_id'] = segment_id - df_vad['name'] = 'speech' - df_vad['tbeg'] = df_vad['tbeg'] - tbeg + # make vad + df_vad = df_rttm_ij.iloc[first_utt : last_utt + 1].copy() + df_vad["file_id"] = segment_id + df_vad["name"] = "speech" + df_vad["tbeg"] = df_vad["tbeg"] - tbeg vad.append(df_vad) - #vad = pd.concat([vad, df_vad], ignore_index=True) + # vad = pd.concat([vad, df_vad], ignore_index=True) - #update remaining length for current speaker in current audio + # update remaining length for current speaker in current audio cum_length -= cum_length[last_utt] total_length = cum_length[-1] first_utt = last_utt + 1 count += 1 - vad = pd.concat(vad, ignore_index=True) - segments.sort_values('segment_id', inplace=True) - vad.sort_values(['file_id', 'tbeg'], inplace=True) + segments.sort_values("segment_id", inplace=True) + vad.sort_values(["file_id", "tbeg"], inplace=True) return segments, vad - + def segm_vad_to_rttm_vad(segments): @@ -154,9 +194,9 @@ def segm_vad_to_rttm_vad(segments): tbeg = segments.beginning_time tdur = segments.end_time - segments.beginning_time num_segments = len(file_id) - segment_type = ['SPEAKER'] * num_segments + segment_type = ["SPEAKER"] * num_segments - nans = ['' for i in range(num_segments)] + nans = ["" for i in range(num_segments)] chnl = [1 for i in range(num_segments)] ortho = nans stype = nans @@ -164,30 +204,33 @@ def segm_vad_to_rttm_vad(segments): conf = [1 for i in range(num_segments)] slat = nans - df = pd.DataFrame({'segment_type': segment_type, - 'file_id': file_id, - 'chnl': chnl, - 'tbeg': tbeg, - 'tdur': tdur, - 'ortho': ortho, - 'stype': stype, - 'name': name, - 'conf': conf, - 'slat': slat}) - df['name'] = 'speech' + df = pd.DataFrame( + { + "segment_type": segment_type, + "file_id": file_id, + "chnl": chnl, + "tbeg": tbeg, + "tdur": tdur, + "ortho": ortho, + "stype": stype, + "name": name, + "conf": conf, + "slat": slat, + } + ) + df["name"] = "speech" return df - def remove_overlap_from_rttm_vad(rttm): - tbeg_index = rttm.columns.get_indexer(['tbeg']) - tdur_index = rttm.columns.get_indexer(['tdur']) - tend = np.asarray(rttm['tbeg'] + rttm['tdur']) + tbeg_index = rttm.columns.get_indexer(["tbeg"]) + tdur_index = rttm.columns.get_indexer(["tdur"]) + tend = np.asarray(rttm["tbeg"] + rttm["tdur"]) index = np.ones(rttm.shape[0], dtype=bool) p = 0 for i in range(1, rttm.shape[0]): - if rttm['file_id'].iloc[p] == 
rttm['file_id'].iloc[i]: + if rttm["file_id"].iloc[p] == rttm["file_id"].iloc[i]: if tend[p] > rttm.iloc[i, tbeg_index].item(): index[i] = False if tend[i] > tend[p]: @@ -201,122 +244,146 @@ def remove_overlap_from_rttm_vad(rttm): rttm = rttm.loc[index] return rttm - + def filter_wavs(df_wav, file_names): - df_wav = df_wav.loc[df_wav['key'].isin(file_names)].sort_values('key') + df_wav = df_wav.loc[df_wav["key"].isin(file_names)].sort_values("key") return df_wav def write_wav(df_wav, df_segments, output_path): df_wav.index = df_wav.key - with open(output_path + '/wav.scp', 'w') as f: + with open(output_path + "/wav.scp", "w") as f: for segment_id, file_id, tbeg, tend in zip( - df_segments['segment_id'], df_segments['filename'], - df_segments['beginning_time'], df_segments['end_time']): - file_path = df_wav.loc[file_id,'file_path'] - f.write('%s sox -t flac %s -t wav - trim %.3f =%.3f | \n' % ( - segment_id, file_path, tbeg, tend)) + df_segments["segment_id"], + df_segments["filename"], + df_segments["beginning_time"], + df_segments["end_time"], + ): + file_path = df_wav.loc[file_id, "file_path"] + f.write( + "%s sox -t flac %s -t wav - trim %.3f =%.3f | \n" + % (segment_id, file_path, tbeg, tend) + ) - def write_utt2spk_from_segm(df_seg, output_path): - with open(output_path + '/utt2spk', 'w') as f: - for u,s in zip(df_seg['segment_id'], df_seg['speaker']): - f.write('%s %s\n' % (u, s)) - + with open(output_path + "/utt2spk", "w") as f: + for u, s in zip(df_seg["segment_id"], df_seg["speaker"]): + f.write("%s %s\n" % (u, s)) def write_dummy_utt2spk(file_names, output_path): - - with open(output_path + '/utt2spk', 'w') as f: + + with open(output_path + "/utt2spk", "w") as f: for fn in file_names: - f.write('%s %s\n' % (fn, fn)) + f.write("%s %s\n" % (fn, fn)) def write_segments(df_seg, output_path): - with open(output_path + '/segments', 'w') as f: + with open(output_path + "/segments", "w") as f: for i, row in df_seg.iterrows(): - f.write('%s %s %.2f %.2f\n' % ( - row['segment_id'], row['filename'], - row['beginning_time'], row['end_time'])) + f.write( + "%s %s %.2f %.2f\n" + % ( + row["segment_id"], + row["filename"], + row["beginning_time"], + row["end_time"], + ) + ) - def write_rttm_vad(df_vad, output_path): - file_path = output_path + '/vad.rttm' - df_vad[['segment_type', 'file_id', 'chnl', - 'tbeg','tdur','ortho', 'stype', - 'name', 'conf', 'slat']].to_csv( - file_path, sep=' ', float_format='%.3f', - index=False, header=False) + file_path = output_path + "/vad.rttm" + df_vad[ + [ + "segment_type", + "file_id", + "chnl", + "tbeg", + "tdur", + "ortho", + "stype", + "name", + "conf", + "slat", + ] + ].to_csv(file_path, sep=" ", float_format="%.3f", index=False, header=False) def write_rttm_spk(df_vad, output_path): - file_path = output_path + '/diarization.rttm' - df_vad[['segment_type', 'file_id', 'chnl', - 'tbeg','tdur','ortho', 'stype', - 'name', 'conf', 'slat']].to_csv( - file_path, sep=' ', float_format='%.3f', - index=False, header=False) + file_path = output_path + "/diarization.rttm" + df_vad[ + [ + "segment_type", + "file_id", + "chnl", + "tbeg", + "tdur", + "ortho", + "stype", + "name", + "conf", + "slat", + ] + ].to_csv(file_path, sep=" ", float_format="%.3f", index=False, header=False) - def make_train(df_wav, df_rttm, output_path, min_dur, max_dur): if not os.path.exists(output_path): os.makedirs(output_path) - #make train segments and vad - print('make training segments') + # make train segments and vad + print("make training segments") segments, vad = 
make_train_segments_from_rttm(df_rttm, min_dur, max_dur) - print('write utt2spk') + print("write utt2spk") write_utt2spk_from_segm(segments, output_path) - + # create wav.scp - print('make wav.scp') + print("make wav.scp") write_wav(df_wav, segments, output_path) - #print('write segments') - #write_segments(segments, output_path) - print('write vad in rttm format') + # print('write segments') + # write_segments(segments, output_path) + print("write vad in rttm format") write_rttm_vad(vad, output_path) - - - - - -def make_dihard_train(rttm_file, uem_file, wav_path, output_path, min_dur, max_dur, data_prefix): - - print('read audios') + + +def make_dihard_train( + rttm_file, uem_file, wav_path, output_path, min_dur, max_dur, data_prefix +): + + print("read audios") df_wav = find_audios(wav_path) - print('read rttm') + print("read rttm") rttm = read_rttm(rttm_file, uem_file) - rttm['name'] = data_prefix + rttm['name'].astype(str) + rttm["name"] = data_prefix + rttm["name"].astype(str) - print('making data directory %s' % (output_path)) + print("making data directory %s" % (output_path)) make_train(df_wav, rttm, output_path, min_dur, max_dur) - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Prepares DIHARD data for PLDA/x-vector training') - - parser.add_argument('--rttm', dest='rttm_file', required=True) - parser.add_argument('--uem', dest='uem_file', default=None) - parser.add_argument('--wav-path', dest='wav_path', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--data-prefix', dest='data_prefix', required=True) - parser.add_argument('--min-train-dur', dest='min_dur', default=15, type=float) - parser.add_argument('--max-train-dur', dest='max_dur', default=60, type=float) - - args=parser.parse_args() - + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Prepares DIHARD data for PLDA/x-vector training", + ) + + parser.add_argument("--rttm", dest="rttm_file", required=True) + parser.add_argument("--uem", dest="uem_file", default=None) + parser.add_argument("--wav-path", dest="wav_path", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--data-prefix", dest="data_prefix", required=True) + parser.add_argument("--min-train-dur", dest="min_dur", default=15, type=float) + parser.add_argument("--max-train-dur", dest="max_dur", default=60, type=float) + + args = parser.parse_args() + make_dihard_train(**vars(args)) diff --git a/egs/sre19-cmn2/v1/local/make_musan.py b/egs/sre19-cmn2/v1/local/make_musan.py index 8a93a18b..b0ae6846 100755 --- a/egs/sre19-cmn2/v1/local/make_musan.py +++ b/egs/sre19-cmn2/v1/local/make_musan.py @@ -7,125 +7,183 @@ import os, sys + def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals def 
prepare_music(root_dir, fs, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_speech(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + 
utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_noise(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def main(): - in_dir = sys.argv[1] - fs = int(sys.argv[2]) - out_dir = sys.argv[3] - use_vocals = sys.argv[4] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + 
utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) -if __name__=="__main__": - main() +if __name__ == "__main__": + main() diff --git a/egs/sre19-cmn2/v1/local/make_some_figs1.py b/egs/sre19-cmn2/v1/local/make_some_figs1.py index 94446b43..f292a78d 100755 --- a/egs/sre19-cmn2/v1/local/make_some_figs1.py +++ b/egs/sre19-cmn2/v1/local/make_some_figs1.py @@ -11,185 +11,186 @@ import pandas as pd import matplotlib -matplotlib.use('Agg') -matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) -#matplotlib.rc('text', usetex=True) + +matplotlib.use("Agg") +matplotlib.rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"]}) +# matplotlib.rc('text', usetex=True) import matplotlib.pyplot as plt -output_dir = 'exp/figs/figs_ft1' +output_dir = "exp/figs/figs_ft1" + def plot_loss_vs_epochs(): - net_dir0 = 'exp/xvector_nnets' - net_dir1 = 'resnet34_zir_e256_arc0.3_do0_adam_lr0.01_b512_amp.v2.ft_1000_6000_sgdcos_lr0.05_b128_amp.v2' - net_dir2 = '.ft_eaffine_rege_w%s_1000_6000_sgdcos_lr0.01_b128_amp.v2.ft_reg_wenc%s_we%s_1000_6000_sgdcos_lr0.01_b128_amp.v2' - w = ['0.001','0.01','0.1', '1', '10'] - colors = ['b','--r','-.g','m','--c', '-.k'] + net_dir0 = "exp/xvector_nnets" + net_dir1 = "resnet34_zir_e256_arc0.3_do0_adam_lr0.01_b512_amp.v2.ft_1000_6000_sgdcos_lr0.05_b128_amp.v2" + net_dir2 = ".ft_eaffine_rege_w%s_1000_6000_sgdcos_lr0.01_b128_amp.v2.ft_reg_wenc%s_we%s_1000_6000_sgdcos_lr0.01_b128_amp.v2" + w = ["0.001", "0.01", "0.1", "1", "10"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] df = [] for i in range(len(w)): wi = w[i] - net_dir2i = net_dir2 % (wi,wi,wi) - net_dir = '%s/%s%s' % (net_dir0, net_dir1, net_dir2i) - file_path = net_dir + '/train.log' + net_dir2i = net_dir2 % (wi, wi, wi) + net_dir = "%s/%s%s" % (net_dir0, net_dir1, net_dir2i) + file_path = net_dir + "/train.log" df_i = pd.read_csv(file_path) df.append(df_i) - cols_h = ['reg-h-enc-0','reg-h-enc-1','reg-h-enc-2','reg-h-enc-3','reg-h-enc-4'] - col_e = 'reg-h-classif-0' - col_cxe = 'loss-classif' - col_val_cxe = 'val_loss' - + cols_h = ["reg-h-enc-0", "reg-h-enc-1", "reg-h-enc-2", "reg-h-enc-3", "reg-h-enc-4"] + col_e = "reg-h-classif-0" + col_cxe = "loss-classif" + col_val_cxe = "val_loss" + plt.figure() for i in range(len(df)): df_i = df[i] m1 = df_i[cols_h].mean(axis=1) + df_i[col_e] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = m1.values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('L1 regularization loss') - plt.xlabel('num. epochs') + plt.ylabel("L1 regularization loss") + plt.xlabel("num. 
epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/lreg_vs_epochs.pdf') + plt.savefig(output_dir + "/lreg_vs_epochs.pdf") plt.close() - colors = ['b','r','g','m','c','k'] + colors = ["b", "r", "g", "m", "c", "k"] plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = df_i[col_cxe].values y_val = df_i[col_val_cxe].values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] y_val = y_val[kk] - plt.plot(x, y, colors[i], label='train-cxe w=%s' % (w[i])) - plt.plot(x, y_val, '--' + colors[i], label='val-cxe w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="train-cxe w=%s" % (w[i])) + plt.plot(x, y_val, "--" + colors[i], label="val-cxe w=%s" % (w[i])) - plt.ylabel('cross-entropy loss') - plt.xlabel('num. epochs') + plt.ylabel("cross-entropy loss") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/cxe_vs_epochs.pdf') + plt.savefig(output_dir + "/cxe_vs_epochs.pdf") plt.close() - plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) - y = df_i['acc'].values - y_val = df_i['val_acc'].values - kk = x<=33 + x = df_i["epoch"].values.astype(np.int) + y = df_i["acc"].values + y_val = df_i["val_acc"].values + kk = x <= 33 x = x[kk] - y = y[kk]*100 - y_val = y_val[kk]*100 - plt.plot(x, y, colors[i], label='train-acc w=%s' % (w[i])) - plt.plot(x, y_val, '--' + colors[i], label='val-acc w=%s' % (w[i])) + y = y[kk] * 100 + y_val = y_val[kk] * 100 + plt.plot(x, y, colors[i], label="train-acc w=%s" % (w[i])) + plt.plot(x, y_val, "--" + colors[i], label="val-acc w=%s" % (w[i])) - plt.ylabel('Accuracy (%)') - plt.xlabel('num. epochs') + plt.ylabel("Accuracy (%)") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/acc_vs_epochs.pdf') + plt.savefig(output_dir + "/acc_vs_epochs.pdf") plt.close() - colors = ['b','--r','-.g','m','--c', '-.k'] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = df_i[col_cxe].values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('train cross-entropy loss') - plt.xlabel('num. epochs') + plt.ylabel("train cross-entropy loss") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/train_cxe_vs_epochs.pdf') + plt.savefig(output_dir + "/train_cxe_vs_epochs.pdf") plt.close() plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = df_i[col_val_cxe].values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('val. cross-entropy loss') - plt.xlabel('num. epochs') + plt.ylabel("val. cross-entropy loss") + plt.xlabel("num. 
epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/val_cxe_vs_epochs.pdf') + plt.savefig(output_dir + "/val_cxe_vs_epochs.pdf") plt.close() plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) - y = df_i['acc'].values - kk = x<=33 + x = df_i["epoch"].values.astype(np.int) + y = df_i["acc"].values + kk = x <= 33 x = x[kk] - y = y[kk]*100 - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + y = y[kk] * 100 + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('train accuracy (%)') - plt.xlabel('num. epochs') + plt.ylabel("train accuracy (%)") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/train_acc_vs_epochs.pdf') + plt.savefig(output_dir + "/train_acc_vs_epochs.pdf") plt.close() plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) - y = df_i['val_acc'].values - kk = x<=33 + x = df_i["epoch"].values.astype(np.int) + y = df_i["val_acc"].values + kk = x <= 33 x = x[kk] - y = y[kk]*100 - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + y = y[kk] * 100 + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('val accuracy (%)') - plt.xlabel('num. epochs') + plt.ylabel("val accuracy (%)") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/val_acc_vs_epochs.pdf') + plt.savefig(output_dir + "/val_acc_vs_epochs.pdf") plt.close() def plot_perf_vs_iter_w(): - w = ['0.001','0.01','0.1', '1', '10'] - be = ['be1','be2','be3'] - colors = ['b','--r','-.g','m','--c', '-.k'] - dbs = ['sre18', 'sre19p', 'sre19e'] - titles = ['SRE18 Eval40%', 'SRE19-Prog', 'SRE19-Eval'] + w = ["0.001", "0.01", "0.1", "1", "10"] + be = ["be1", "be2", "be3"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] + dbs = ["sre18", "sre19p", "sre19e"] + titles = ["SRE18 Eval40%", "SRE19-Prog", "SRE19-Eval"] for i in range(len(be)): df = [] for j in range(len(w)): - in_file = '%s/table1_w%s_%s.csv' % (output_dir, w[j], be[i]) + in_file = "%s/table1_w%s_%s.csv" % (output_dir, w[j], be[i]) df.append(pd.read_csv(in_file, index_col=False)) - xlabels = df[0]['system'] + xlabels = df[0]["system"] for k in range(len(dbs)): title_k = titles[k] dbk = dbs[k] @@ -197,140 +198,138 @@ def plot_perf_vs_iter_w(): for j in range(len(w)): df_j = df[j] - - y = df_j[dbk + '-eer'] - plt.plot(y, colors[j], label='w=%s' % (w[j])) - - plt.ylabel('EER(%)') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + y = df_j[dbk + "-eer"] + plt.plot(y, colors[j], label="w=%s" % (w[j])) + + plt.ylabel("EER(%)") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) locs, _ = plt.xticks() plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_%s_eer_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig("%s/%s_%s_eer_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() plt.figure() for j in range(len(w)): df_j = df[j] - - y = df_j[dbk + '-min-dcf'] - plt.plot(y, colors[j], label='w=%s' % (w[j])) - - plt.ylabel('MinCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + y = df_j[dbk + "-min-dcf"] + plt.plot(y, colors[j], label="w=%s" % (w[j])) + + plt.ylabel("MinCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) locs, _ = plt.xticks() plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_%s_mindcf_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig("%s/%s_%s_mindcf_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() plt.figure() 
for j in range(len(w)): df_j = df[j] - y = df_j[dbk + '-act-dcf'] - plt.plot(y, colors[j], label='w=%s' % (w[j])) - - plt.ylabel('ActCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + y = df_j[dbk + "-act-dcf"] + plt.plot(y, colors[j], label="w=%s" % (w[j])) + + plt.ylabel("ActCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) locs, _ = plt.xticks() plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_%s_actdcf_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig("%s/%s_%s_actdcf_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() - def plot_perf_vs_iter_be(): - w = ['0.001','0.01','0.1', '1', '10'] - w = ['1'] - be = ['be1','be2','be3'] - colors = ['b','--r','-.g','m','--c', '-.k'] - dbs = ['sre18', 'sre19p', 'sre19e'] - titles = ['SRE18 Eval40%', 'SRE19-Prog', 'SRE19-Eval'] - + w = ["0.001", "0.01", "0.1", "1", "10"] + w = ["1"] + be = ["be1", "be2", "be3"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] + dbs = ["sre18", "sre19p", "sre19e"] + titles = ["SRE18 Eval40%", "SRE19-Prog", "SRE19-Eval"] + for j in range(len(w)): df = [] - for i in range(len(be)): - in_file = '%s/table1_w%s_%s.csv' % (output_dir, w[j], be[i]) + for i in range(len(be)): + in_file = "%s/table1_w%s_%s.csv" % (output_dir, w[j], be[i]) df.append(pd.read_csv(in_file, index_col=False)) - xlabels = df[0]['system'] + xlabels = df[0]["system"] for k in range(len(dbs)): title_k = titles[k] dbk = dbs[k] plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-eer'] + + y = df_i[dbk + "-eer"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('EER(%)') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("EER(%)") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) locs, _ = plt.xticks() plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_eer_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_eer_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-min-dcf'] + + y = df_i[dbk + "-min-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('MinCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("MinCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) locs, _ = plt.xticks() plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_mindcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_mindcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - y = df_i[dbk + '-act-dcf'] + y = df_i[dbk + "-act-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('ActCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("ActCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) locs, _ = plt.xticks() plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_actdcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_actdcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() - if __name__ == "__main__": if not os.path.isdir(output_dir): diff --git a/egs/sre19-cmn2/v1/local/make_some_figs2.py b/egs/sre19-cmn2/v1/local/make_some_figs2.py index 96104cdb..eb1f8ccd 100755 --- 
a/egs/sre19-cmn2/v1/local/make_some_figs2.py +++ b/egs/sre19-cmn2/v1/local/make_some_figs2.py @@ -11,185 +11,186 @@ import pandas as pd import matplotlib -matplotlib.use('Agg') -matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) -#matplotlib.rc('text', usetex=True) + +matplotlib.use("Agg") +matplotlib.rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"]}) +# matplotlib.rc('text', usetex=True) import matplotlib.pyplot as plt -output_dir = 'exp/figs/figs_ft2' +output_dir = "exp/figs/figs_ft2" + def plot_loss_vs_epochs(): - net_dir0 = 'exp/xvector_nnets' - net_dir1 = 'resnet34_zir_e256_arc0.3_do0_adam_lr0.01_b512_amp.v2.ft_1000_6000_sgdcos_lr0.05_b128_amp.v2' - net_dir2 = '.ft_eaffine_rege_w%s_1000_6000_sgdcos_lr0.01_b128_amp.v2.ft_reg_wenc%s_we%s_1000_6000_sgdcos_lr0.01_b128_amp.v2' - w = ['0.001','0.01','0.1', '1', '10'] - colors = ['b','--r','-.g','m','--c', '-.k'] + net_dir0 = "exp/xvector_nnets" + net_dir1 = "resnet34_zir_e256_arc0.3_do0_adam_lr0.01_b512_amp.v2.ft_1000_6000_sgdcos_lr0.05_b128_amp.v2" + net_dir2 = ".ft_eaffine_rege_w%s_1000_6000_sgdcos_lr0.01_b128_amp.v2.ft_reg_wenc%s_we%s_1000_6000_sgdcos_lr0.01_b128_amp.v2" + w = ["0.001", "0.01", "0.1", "1", "10"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] df = [] for i in range(len(w)): wi = w[i] - net_dir2i = net_dir2 % (wi,wi,wi) - net_dir = '%s/%s%s' % (net_dir0, net_dir1, net_dir2i) - file_path = net_dir + '/train.log' + net_dir2i = net_dir2 % (wi, wi, wi) + net_dir = "%s/%s%s" % (net_dir0, net_dir1, net_dir2i) + file_path = net_dir + "/train.log" df_i = pd.read_csv(file_path) df.append(df_i) - cols_h = ['reg-h-enc-0','reg-h-enc-1','reg-h-enc-2','reg-h-enc-3','reg-h-enc-4'] - col_e = 'reg-h-classif-0' - col_cxe = 'loss-classif' - col_val_cxe = 'val_loss' - + cols_h = ["reg-h-enc-0", "reg-h-enc-1", "reg-h-enc-2", "reg-h-enc-3", "reg-h-enc-4"] + col_e = "reg-h-classif-0" + col_cxe = "loss-classif" + col_val_cxe = "val_loss" + plt.figure() for i in range(len(df)): df_i = df[i] m1 = df_i[cols_h].mean(axis=1) + df_i[col_e] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = m1.values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('L1 regularization loss') - plt.xlabel('num. epochs') + plt.ylabel("L1 regularization loss") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/lreg_vs_epochs.pdf') + plt.savefig(output_dir + "/lreg_vs_epochs.pdf") plt.close() - colors = ['b','r','g','m','c','k'] + colors = ["b", "r", "g", "m", "c", "k"] plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = df_i[col_cxe].values y_val = df_i[col_val_cxe].values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] y_val = y_val[kk] - plt.plot(x, y, colors[i], label='train-cxe w=%s' % (w[i])) - plt.plot(x, y_val, '--' + colors[i], label='val-cxe w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="train-cxe w=%s" % (w[i])) + plt.plot(x, y_val, "--" + colors[i], label="val-cxe w=%s" % (w[i])) - plt.ylabel('cross-entropy loss') - plt.xlabel('num. epochs') + plt.ylabel("cross-entropy loss") + plt.xlabel("num. 
epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/cxe_vs_epochs.pdf') + plt.savefig(output_dir + "/cxe_vs_epochs.pdf") plt.close() - plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) - y = df_i['acc'].values - y_val = df_i['val_acc'].values - kk = x<=33 + x = df_i["epoch"].values.astype(np.int) + y = df_i["acc"].values + y_val = df_i["val_acc"].values + kk = x <= 33 x = x[kk] - y = y[kk]*100 - y_val = y_val[kk]*100 - plt.plot(x, y, colors[i], label='train-acc w=%s' % (w[i])) - plt.plot(x, y_val, '--' + colors[i], label='val-acc w=%s' % (w[i])) + y = y[kk] * 100 + y_val = y_val[kk] * 100 + plt.plot(x, y, colors[i], label="train-acc w=%s" % (w[i])) + plt.plot(x, y_val, "--" + colors[i], label="val-acc w=%s" % (w[i])) - plt.ylabel('Accuracy (%)') - plt.xlabel('num. epochs') + plt.ylabel("Accuracy (%)") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/acc_vs_epochs.pdf') + plt.savefig(output_dir + "/acc_vs_epochs.pdf") plt.close() - colors = ['b','--r','-.g','m','--c', '-.k'] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = df_i[col_cxe].values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('train cross-entropy loss') - plt.xlabel('num. epochs') + plt.ylabel("train cross-entropy loss") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/train_cxe_vs_epochs.pdf') + plt.savefig(output_dir + "/train_cxe_vs_epochs.pdf") plt.close() plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = df_i[col_val_cxe].values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('val. cross-entropy loss') - plt.xlabel('num. epochs') + plt.ylabel("val. cross-entropy loss") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/val_cxe_vs_epochs.pdf') + plt.savefig(output_dir + "/val_cxe_vs_epochs.pdf") plt.close() plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) - y = df_i['acc'].values - kk = x<=33 + x = df_i["epoch"].values.astype(np.int) + y = df_i["acc"].values + kk = x <= 33 x = x[kk] - y = y[kk]*100 - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + y = y[kk] * 100 + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('train accuracy (%)') - plt.xlabel('num. epochs') + plt.ylabel("train accuracy (%)") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/train_acc_vs_epochs.pdf') + plt.savefig(output_dir + "/train_acc_vs_epochs.pdf") plt.close() plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) - y = df_i['val_acc'].values - kk = x<=33 + x = df_i["epoch"].values.astype(np.int) + y = df_i["val_acc"].values + kk = x <= 33 x = x[kk] - y = y[kk]*100 - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + y = y[kk] * 100 + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('val accuracy (%)') - plt.xlabel('num. epochs') + plt.ylabel("val accuracy (%)") + plt.xlabel("num. 
epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/val_acc_vs_epochs.pdf') + plt.savefig(output_dir + "/val_acc_vs_epochs.pdf") plt.close() def plot_perf_vs_iter_w(): - w = ['0.001','0.01','0.1', '1', '10'] - be = ['be1','be2','be3', 'be1-snorm', 'be2-snorm','be3-snorm'] - colors = ['b','--r','-.g','m','--c', '-.k'] - dbs = ['sre18', 'sre19p', 'sre19e'] - titles = ['SRE18 Eval40%', 'SRE19-Prog', 'SRE19-Eval'] + w = ["0.001", "0.01", "0.1", "1", "10"] + be = ["be1", "be2", "be3", "be1-snorm", "be2-snorm", "be3-snorm"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] + dbs = ["sre18", "sre19p", "sre19e"] + titles = ["SRE18 Eval40%", "SRE19-Prog", "SRE19-Eval"] for i in range(len(be)): df = [] for j in range(len(w)): - in_file = '%s/table1_w%s_%s.csv' % (output_dir, w[j], be[i]) + in_file = "%s/table1_w%s_%s.csv" % (output_dir, w[j], be[i]) df.append(pd.read_csv(in_file, index_col=False)) - xlabels = df[0]['system'].values + xlabels = df[0]["system"].values locs = [l for l in range(len(xlabels))] for k in range(len(dbs)): title_k = titles[k] @@ -198,211 +199,209 @@ def plot_perf_vs_iter_w(): for j in range(len(w)): df_j = df[j] - - y = df_j[dbk + '-eer'].values - plt.plot(y, colors[j], label='w=%s' % (w[j])) - - plt.ylabel('EER(%)') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + y = df_j[dbk + "-eer"].values + plt.plot(y, colors[j], label="w=%s" % (w[j])) + + plt.ylabel("EER(%)") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_%s_eer_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig("%s/%s_%s_eer_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() plt.figure() for j in range(len(w)): df_j = df[j] - - y = df_j[dbk + '-min-dcf'] - plt.plot(y, colors[j], label='w=%s' % (w[j])) - - plt.ylabel('MinCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + y = df_j[dbk + "-min-dcf"] + plt.plot(y, colors[j], label="w=%s" % (w[j])) + + plt.ylabel("MinCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_%s_mindcf_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig("%s/%s_%s_mindcf_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() plt.figure() for j in range(len(w)): df_j = df[j] - y = df_j[dbk + '-act-dcf'] - plt.plot(y, colors[j], label='w=%s' % (w[j])) - - plt.ylabel('ActCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + y = df_j[dbk + "-act-dcf"] + plt.plot(y, colors[j], label="w=%s" % (w[j])) + + plt.ylabel("ActCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_%s_actdcf_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig("%s/%s_%s_actdcf_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() - def plot_perf_vs_iter_be1(): - w = ['0.001','0.01','0.1', '1', '10'] - w = ['1'] - be = ['be1-snorm','be2-snorm','be3-snorm'] - colors = ['b','--r','-.g','m','--c', '-.k'] - dbs = ['sre18', 'sre19p', 'sre19e'] - titles = ['SRE18 Eval40%', 'SRE19-Prog', 'SRE19-Eval'] - + w = ["0.001", "0.01", "0.1", "1", "10"] + w = ["1"] + be = ["be1-snorm", "be2-snorm", "be3-snorm"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] + dbs = ["sre18", "sre19p", "sre19e"] + titles = ["SRE18 Eval40%", "SRE19-Prog", "SRE19-Eval"] + for j in range(len(w)): df = [] - for i 
in range(len(be)): - in_file = '%s/table1_w%s_%s.csv' % (output_dir, w[j], be[i]) + for i in range(len(be)): + in_file = "%s/table1_w%s_%s.csv" % (output_dir, w[j], be[i]) df.append(pd.read_csv(in_file, index_col=False)) - xlabels = df[0]['system'].values + xlabels = df[0]["system"].values locs = [l for l in range(len(xlabels))] for k in range(len(dbs)): title_k = titles[k] dbk = dbs[k] plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-eer'] + + y = df_i[dbk + "-eer"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('EER(%)') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("EER(%)") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_eer_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_eer_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-min-dcf'] + + y = df_i[dbk + "-min-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('MinCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("MinCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_mindcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_mindcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - y = df_i[dbk + '-act-dcf'] + y = df_i[dbk + "-act-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('ActCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("ActCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_actdcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_actdcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() def plot_perf_vs_iter_be2(): - w = ['0.001','0.01','0.1', '1', '10'] - w = ['1'] - be = ['be1','be2','be3', 'be1-snorm','be2-snorm','be3-snorm'] - colors = ['b','r','g','--b','--r', '--g'] - dbs = ['sre18', 'sre19p', 'sre19e'] - titles = ['SRE18 Eval40%', 'SRE19-Prog', 'SRE19-Eval'] - + w = ["0.001", "0.01", "0.1", "1", "10"] + w = ["1"] + be = ["be1", "be2", "be3", "be1-snorm", "be2-snorm", "be3-snorm"] + colors = ["b", "r", "g", "--b", "--r", "--g"] + dbs = ["sre18", "sre19p", "sre19e"] + titles = ["SRE18 Eval40%", "SRE19-Prog", "SRE19-Eval"] + for j in range(len(w)): df = [] - for i in range(len(be)): - in_file = '%s/table1_w%s_%s.csv' % (output_dir, w[j], be[i]) + for i in range(len(be)): + in_file = "%s/table1_w%s_%s.csv" % (output_dir, w[j], be[i]) df.append(pd.read_csv(in_file, index_col=False)) - xlabels = df[0]['system'].values + xlabels = df[0]["system"].values locs = [l for l in range(len(xlabels))] for k in range(len(dbs)): title_k = titles[k] dbk = dbs[k] plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-eer'] + + y = df_i[dbk + "-eer"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('EER(%)') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("EER(%)") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() 
plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_eer_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_eer_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-min-dcf'] + + y = df_i[dbk + "-min-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('MinCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("MinCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_mindcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_mindcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - y = df_i[dbk + '-act-dcf'] + y = df_i[dbk + "-act-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('ActCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("ActCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_actdcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_actdcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() - if __name__ == "__main__": if not os.path.isdir(output_dir): diff --git a/egs/sre19-cmn2/v1/local/make_some_figs3.py b/egs/sre19-cmn2/v1/local/make_some_figs3.py index fe0245e7..c63e2383 100755 --- a/egs/sre19-cmn2/v1/local/make_some_figs3.py +++ b/egs/sre19-cmn2/v1/local/make_some_figs3.py @@ -11,185 +11,186 @@ import pandas as pd import matplotlib -matplotlib.use('Agg') -matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) -#matplotlib.rc('text', usetex=True) + +matplotlib.use("Agg") +matplotlib.rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"]}) +# matplotlib.rc('text', usetex=True) import matplotlib.pyplot as plt -output_dir = 'exp/figs/figs_ft3' +output_dir = "exp/figs/figs_ft3" + def plot_loss_vs_epochs(): - net_dir0 = 'exp/xvector_nnets' - net_dir1 = 'resnet34_zir_e256_arc0.3_do0_adam_lr0.01_b512_amp.v2.ft_1000_6000_sgdcos_lr0.05_b128_amp.v2' - net_dir2 = '.ft_eaffine_rege_w%s_1000_6000_sgdcos_lr0.01_b128_amp.v2.ft_reg_wenc%s_we%s_1000_6000_sgdcos_lr0.01_b128_amp.v2' - w = ['0.001','0.01','0.1', '1', '10'] - colors = ['b','--r','-.g','m','--c', '-.k'] + net_dir0 = "exp/xvector_nnets" + net_dir1 = "resnet34_zir_e256_arc0.3_do0_adam_lr0.01_b512_amp.v2.ft_1000_6000_sgdcos_lr0.05_b128_amp.v2" + net_dir2 = ".ft_eaffine_rege_w%s_1000_6000_sgdcos_lr0.01_b128_amp.v2.ft_reg_wenc%s_we%s_1000_6000_sgdcos_lr0.01_b128_amp.v2" + w = ["0.001", "0.01", "0.1", "1", "10"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] df = [] for i in range(len(w)): wi = w[i] - net_dir2i = net_dir2 % (wi,wi,wi) - net_dir = '%s/%s%s' % (net_dir0, net_dir1, net_dir2i) - file_path = net_dir + '/train.log' + net_dir2i = net_dir2 % (wi, wi, wi) + net_dir = "%s/%s%s" % (net_dir0, net_dir1, net_dir2i) + file_path = net_dir + "/train.log" df_i = pd.read_csv(file_path) df.append(df_i) - cols_h = ['reg-h-enc-0','reg-h-enc-1','reg-h-enc-2','reg-h-enc-3','reg-h-enc-4'] - col_e = 'reg-h-classif-0' - col_cxe = 'loss-classif' - col_val_cxe = 'val_loss' - + cols_h = ["reg-h-enc-0", "reg-h-enc-1", "reg-h-enc-2", "reg-h-enc-3", "reg-h-enc-4"] + col_e = "reg-h-classif-0" + col_cxe = "loss-classif" + col_val_cxe = 
"val_loss" + plt.figure() for i in range(len(df)): df_i = df[i] m1 = df_i[cols_h].mean(axis=1) + df_i[col_e] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = m1.values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('L1 regularization loss') - plt.xlabel('num. epochs') + plt.ylabel("L1 regularization loss") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/lreg_vs_epochs.pdf') + plt.savefig(output_dir + "/lreg_vs_epochs.pdf") plt.close() - colors = ['b','r','g','m','c','k'] + colors = ["b", "r", "g", "m", "c", "k"] plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = df_i[col_cxe].values y_val = df_i[col_val_cxe].values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] y_val = y_val[kk] - plt.plot(x, y, colors[i], label='train-cxe w=%s' % (w[i])) - plt.plot(x, y_val, '--' + colors[i], label='val-cxe w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="train-cxe w=%s" % (w[i])) + plt.plot(x, y_val, "--" + colors[i], label="val-cxe w=%s" % (w[i])) - plt.ylabel('cross-entropy loss') - plt.xlabel('num. epochs') + plt.ylabel("cross-entropy loss") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/cxe_vs_epochs.pdf') + plt.savefig(output_dir + "/cxe_vs_epochs.pdf") plt.close() - plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) - y = df_i['acc'].values - y_val = df_i['val_acc'].values - kk = x<=33 + x = df_i["epoch"].values.astype(np.int) + y = df_i["acc"].values + y_val = df_i["val_acc"].values + kk = x <= 33 x = x[kk] - y = y[kk]*100 - y_val = y_val[kk]*100 - plt.plot(x, y, colors[i], label='train-acc w=%s' % (w[i])) - plt.plot(x, y_val, '--' + colors[i], label='val-acc w=%s' % (w[i])) + y = y[kk] * 100 + y_val = y_val[kk] * 100 + plt.plot(x, y, colors[i], label="train-acc w=%s" % (w[i])) + plt.plot(x, y_val, "--" + colors[i], label="val-acc w=%s" % (w[i])) - plt.ylabel('Accuracy (%)') - plt.xlabel('num. epochs') + plt.ylabel("Accuracy (%)") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/acc_vs_epochs.pdf') + plt.savefig(output_dir + "/acc_vs_epochs.pdf") plt.close() - colors = ['b','--r','-.g','m','--c', '-.k'] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = df_i[col_cxe].values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('train cross-entropy loss') - plt.xlabel('num. epochs') + plt.ylabel("train cross-entropy loss") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/train_cxe_vs_epochs.pdf') + plt.savefig(output_dir + "/train_cxe_vs_epochs.pdf") plt.close() plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) + x = df_i["epoch"].values.astype(np.int) y = df_i[col_val_cxe].values - kk = x<=33 + kk = x <= 33 x = x[kk] y = y[kk] - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('val. cross-entropy loss') - plt.xlabel('num. epochs') + plt.ylabel("val. cross-entropy loss") + plt.xlabel("num. 
epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/val_cxe_vs_epochs.pdf') + plt.savefig(output_dir + "/val_cxe_vs_epochs.pdf") plt.close() plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) - y = df_i['acc'].values - kk = x<=33 + x = df_i["epoch"].values.astype(np.int) + y = df_i["acc"].values + kk = x <= 33 x = x[kk] - y = y[kk]*100 - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + y = y[kk] * 100 + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('train accuracy (%)') - plt.xlabel('num. epochs') + plt.ylabel("train accuracy (%)") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/train_acc_vs_epochs.pdf') + plt.savefig(output_dir + "/train_acc_vs_epochs.pdf") plt.close() plt.figure() for i in range(len(df)): df_i = df[i] - x = df_i['epoch'].values.astype(np.int) - y = df_i['val_acc'].values - kk = x<=33 + x = df_i["epoch"].values.astype(np.int) + y = df_i["val_acc"].values + kk = x <= 33 x = x[kk] - y = y[kk]*100 - plt.plot(x, y, colors[i], label='w=%s' % (w[i])) + y = y[kk] * 100 + plt.plot(x, y, colors[i], label="w=%s" % (w[i])) - plt.ylabel('val accuracy (%)') - plt.xlabel('num. epochs') + plt.ylabel("val accuracy (%)") + plt.xlabel("num. epochs") plt.grid() plt.legend() - plt.savefig(output_dir + '/val_acc_vs_epochs.pdf') + plt.savefig(output_dir + "/val_acc_vs_epochs.pdf") plt.close() def plot_perf_vs_iter_w(): - w = ['0.001','0.01','0.1', '1', '10'] - be = ['be1','be2','be3', 'be1-snorm', 'be2-snorm','be3-snorm'] - colors = ['b','--r','-.g','m','--c', '-.k'] - dbs = ['sre18', 'sre19p', 'sre19e'] - titles = ['SRE18 Eval40%', 'SRE19-Prog', 'SRE19-Eval'] + w = ["0.001", "0.01", "0.1", "1", "10"] + be = ["be1", "be2", "be3", "be1-snorm", "be2-snorm", "be3-snorm"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] + dbs = ["sre18", "sre19p", "sre19e"] + titles = ["SRE18 Eval40%", "SRE19-Prog", "SRE19-Eval"] for i in range(len(be)): df = [] for j in range(len(w)): - in_file = '%s/table1_w%s_%s.csv' % (output_dir, w[j], be[i]) + in_file = "%s/table1_w%s_%s.csv" % (output_dir, w[j], be[i]) df.append(pd.read_csv(in_file, index_col=False)) - xlabels = df[0]['system'].values + xlabels = df[0]["system"].values locs = [l for l in range(len(xlabels))] for k in range(len(dbs)): title_k = titles[k] @@ -198,76 +199,74 @@ def plot_perf_vs_iter_w(): for j in range(len(w)): df_j = df[j] - - y = df_j[dbk + '-eer'].values - plt.plot(y, colors[j], label='w=%s' % (w[j])) - - plt.ylabel('EER(%)') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + y = df_j[dbk + "-eer"].values + plt.plot(y, colors[j], label="w=%s" % (w[j])) + + plt.ylabel("EER(%)") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_%s_eer_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig("%s/%s_%s_eer_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() plt.figure() for j in range(len(w)): df_j = df[j] - - y = df_j[dbk + '-min-dcf'] - plt.plot(y, colors[j], label='w=%s' % (w[j])) - - plt.ylabel('MinCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + y = df_j[dbk + "-min-dcf"] + plt.plot(y, colors[j], label="w=%s" % (w[j])) + + plt.ylabel("MinCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_%s_mindcf_vs_epochs.pdf' % (output_dir, dbk, be[i])) + 
plt.savefig("%s/%s_%s_mindcf_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() plt.figure() for j in range(len(w)): df_j = df[j] - y = df_j[dbk + '-act-dcf'] - plt.plot(y, colors[j], label='w=%s' % (w[j])) - - plt.ylabel('ActCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + y = df_j[dbk + "-act-dcf"] + plt.plot(y, colors[j], label="w=%s" % (w[j])) + + plt.ylabel("ActCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_%s_actdcf_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig("%s/%s_%s_actdcf_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() - - def plot_perf_vs_iter_nnet(): - nnet_nb = ['0','1','2'] - nnet_name = ['ResNet34', 'SE-ResNet34', 'TSE-ResNet34'] - be = ['be1','be2','be3', 'be1-snorm', 'be2-snorm','be3-snorm'] - colors = ['b','--r','-.g','m','--c', '-.k'] - dbs = ['sre18', 'sre19p', 'sre19e'] - titles = ['SRE18 Eval40%', 'SRE19-Prog', 'SRE19-Eval'] + nnet_nb = ["0", "1", "2"] + nnet_name = ["ResNet34", "SE-ResNet34", "TSE-ResNet34"] + be = ["be1", "be2", "be3", "be1-snorm", "be2-snorm", "be3-snorm"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] + dbs = ["sre18", "sre19p", "sre19e"] + titles = ["SRE18 Eval40%", "SRE19-Prog", "SRE19-Eval"] for i in range(len(be)): df = [] for j in range(len(nnet_nb)): - in_file = '%s/table1_nnet%s_%s.csv' % (output_dir, nnet_nb[j], be[i]) + in_file = "%s/table1_nnet%s_%s.csv" % (output_dir, nnet_nb[j], be[i]) df.append(pd.read_csv(in_file, index_col=False)) - xlabels = df[0]['system'].values + xlabels = df[0]["system"].values locs = [l for l in range(len(xlabels))] for k in range(len(dbs)): title_k = titles[k] @@ -276,217 +275,219 @@ def plot_perf_vs_iter_nnet(): for j in range(len(nnet_nb)): df_j = df[j] - - y = df_j[dbk + '-eer'].values - plt.plot(y, colors[j], label='%s' % (nnet_name[j])) - - plt.ylabel('EER(%)') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + y = df_j[dbk + "-eer"].values + plt.plot(y, colors[j], label="%s" % (nnet_name[j])) + + plt.ylabel("EER(%)") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_nnets_%s_eer_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig("%s/%s_nnets_%s_eer_vs_epochs.pdf" % (output_dir, dbk, be[i])) plt.close() plt.figure() for j in range(len(nnet_nb)): df_j = df[j] - - y = df_j[dbk + '-min-dcf'] - plt.plot(y, colors[j], label='%s' % (nnet_name[j])) - - plt.ylabel('MinCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + y = df_j[dbk + "-min-dcf"] + plt.plot(y, colors[j], label="%s" % (nnet_name[j])) + + plt.ylabel("MinCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_nnets_%s_mindcf_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig( + "%s/%s_nnets_%s_mindcf_vs_epochs.pdf" % (output_dir, dbk, be[i]) + ) plt.close() plt.figure() for j in range(len(nnet_nb)): df_j = df[j] - y = df_j[dbk + '-act-dcf'] - plt.plot(y, colors[j], label='%s' % (nnet_name[j])) - - plt.ylabel('ActCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + y = df_j[dbk + "-act-dcf"] + plt.plot(y, colors[j], label="%s" % (nnet_name[j])) + + plt.ylabel("ActCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() 
plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_nnets_%s_actdcf_vs_epochs.pdf' % (output_dir, dbk, be[i])) + plt.savefig( + "%s/%s_nnets_%s_actdcf_vs_epochs.pdf" % (output_dir, dbk, be[i]) + ) plt.close() - def plot_perf_vs_iter_be1(): - w = ['0.001','0.01','0.1', '1', '10'] - w = ['1'] - be = ['be1-snorm','be2-snorm','be3-snorm'] - colors = ['b','--r','-.g','m','--c', '-.k'] - dbs = ['sre18', 'sre19p', 'sre19e'] - titles = ['SRE18 Eval40%', 'SRE19-Prog', 'SRE19-Eval'] - + w = ["0.001", "0.01", "0.1", "1", "10"] + w = ["1"] + be = ["be1-snorm", "be2-snorm", "be3-snorm"] + colors = ["b", "--r", "-.g", "m", "--c", "-.k"] + dbs = ["sre18", "sre19p", "sre19e"] + titles = ["SRE18 Eval40%", "SRE19-Prog", "SRE19-Eval"] + for j in range(len(w)): df = [] - for i in range(len(be)): - in_file = '%s/table1_w%s_%s.csv' % (output_dir, w[j], be[i]) + for i in range(len(be)): + in_file = "%s/table1_w%s_%s.csv" % (output_dir, w[j], be[i]) df.append(pd.read_csv(in_file, index_col=False)) - xlabels = df[0]['system'].values + xlabels = df[0]["system"].values locs = [l for l in range(len(xlabels))] for k in range(len(dbs)): title_k = titles[k] dbk = dbs[k] plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-eer'] + + y = df_i[dbk + "-eer"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('EER(%)') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("EER(%)") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_eer_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_eer_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-min-dcf'] + + y = df_i[dbk + "-min-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('MinCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("MinCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_mindcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_mindcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - y = df_i[dbk + '-act-dcf'] + y = df_i[dbk + "-act-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('ActCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("ActCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_actdcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_actdcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() def plot_perf_vs_iter_be2(): - w = ['0.001','0.01','0.1', '1', '10'] - w = ['1'] - be = ['be1','be2','be3', 'be1-snorm','be2-snorm','be3-snorm'] - colors = ['b','r','g','--b','--r', '--g'] - dbs = ['sre18', 'sre19p', 'sre19e'] - titles = ['SRE18 Eval40%', 'SRE19-Prog', 'SRE19-Eval'] - + w = ["0.001", "0.01", "0.1", "1", "10"] + w = ["1"] + be = ["be1", "be2", "be3", "be1-snorm", "be2-snorm", "be3-snorm"] + colors = ["b", "r", "g", "--b", "--r", "--g"] + dbs = ["sre18", "sre19p", "sre19e"] + titles = ["SRE18 Eval40%", "SRE19-Prog", "SRE19-Eval"] + for j in range(len(w)): df = [] - 
for i in range(len(be)): - in_file = '%s/table1_w%s_%s.csv' % (output_dir, w[j], be[i]) + for i in range(len(be)): + in_file = "%s/table1_w%s_%s.csv" % (output_dir, w[j], be[i]) df.append(pd.read_csv(in_file, index_col=False)) - xlabels = df[0]['system'].values + xlabels = df[0]["system"].values locs = [l for l in range(len(xlabels))] for k in range(len(dbs)): title_k = titles[k] dbk = dbs[k] plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-eer'] + + y = df_i[dbk + "-eer"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('EER(%)') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("EER(%)") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_eer_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_eer_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - - y = df_i[dbk + '-min-dcf'] + + y = df_i[dbk + "-min-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('MinCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("MinCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_mindcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_mindcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() plt.figure() - for i in range(len(be)): + for i in range(len(be)): df_i = df[i] - y = df_i[dbk + '-act-dcf'] + y = df_i[dbk + "-act-dcf"] plt.plot(y, colors[i], label=be[i]) - - plt.ylabel('ActCprimary') - plt.xlabel('model') - plt.xlim(0, len(y)-1) + + plt.ylabel("ActCprimary") + plt.xlabel("model") + plt.xlim(0, len(y) - 1) plt.xticks(locs, xlabels, rotation=10, fontsize=8) plt.grid() plt.legend() plt.title(title_k) plt.tight_layout() - plt.savefig('%s/%s_w%s_actdcf_vs_epochs.pdf' % (output_dir, dbk, w[j])) + plt.savefig("%s/%s_w%s_actdcf_vs_epochs.pdf" % (output_dir, dbk, w[j])) plt.close() - if __name__ == "__main__": if not os.path.isdir(output_dir): os.makedirs(output_dir) - #plot_loss_vs_epochs() - #plot_perf_vs_iter_w() - #plot_perf_vs_iter_be2() + # plot_loss_vs_epochs() + # plot_perf_vs_iter_w() + # plot_perf_vs_iter_be2() plot_perf_vs_iter_nnet() diff --git a/egs/sre19-cmn2/v1/local/plot_scores_sre19av.py b/egs/sre19-cmn2/v1/local/plot_scores_sre19av.py index e2922e40..874d0bd5 100755 --- a/egs/sre19-cmn2/v1/local/plot_scores_sre19av.py +++ b/egs/sre19-cmn2/v1/local/plot_scores_sre19av.py @@ -5,9 +5,6 @@ """ - - - import sys import os import argparse @@ -16,7 +13,8 @@ import numpy as np from scipy.stats import mode import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.utils.trial_scores import TrialScores @@ -24,28 +22,40 @@ from hyperion.utils.trial_key import TrialKey -def gauss_map_adapt(mu,sigma,mu0,sigma0,N,r_mu,r_s2): +def gauss_map_adapt(mu, sigma, mu0, sigma0, N, r_mu, r_s2): - s2 = sigma**2 - s02 = sigma0**2 + s2 = sigma ** 2 + s02 = sigma0 ** 2 - alpha_mu = N/(N+r_mu) - alpha_s2 = N/(N+r_mu) - mu_map = alpha_mu * mu + (1-alpha_mu)*mu0 - s2_map = alpha_s2*s2+(1-alpha_s2)*s02 + alpha_s2*(1-alpha_mu)*(mu-mu0)**2 + alpha_mu = N / (N + r_mu) + alpha_s2 = N / (N + r_mu) + mu_map = alpha_mu * mu + (1 - alpha_mu) * mu0 + s2_map = ( + alpha_s2 * s2 + + (1 - alpha_s2) * 
s02 + + alpha_s2 * (1 - alpha_mu) * (mu - mu0) ** 2 + ) return mu_map, np.sqrt(s2_map) def plot_scores_sre18( - key_sitw_core, scores_sitw_core, - key_sitw_core_multi, scores_sitw_core_multi, - key_sre18_eval, scores_sre18_eval, - key_sre19_dev, scores_sre19_dev, - key_sre19_eval, scores_sre19_eval, - key_janus_dev, scores_janus_dev, - key_janus_eval, scores_janus_eval, - name, - output_path): + key_sitw_core, + scores_sitw_core, + key_sitw_core_multi, + scores_sitw_core_multi, + key_sre18_eval, + scores_sre18_eval, + key_sre19_dev, + scores_sre19_dev, + key_sre19_eval, + scores_sre19_eval, + key_janus_dev, + scores_janus_dev, + key_janus_eval, + scores_janus_eval, + name, + output_path, +): if not os.path.exists(output_path): os.makedirs(ouput_path) @@ -58,7 +68,7 @@ def plot_scores_sre18( scr_sre18_eval = TrialScores.load_txt(scores_sre18_eval) k_sre19_dev = TrialKey.load_txt(key_sre19_dev) scr_sre19_dev = TrialScores.load_txt(scores_sre19_dev) - #k_sre19_eval = TrialKey.load_txt(key_sre19_eval) + # k_sre19_eval = TrialKey.load_txt(key_sre19_eval) k_sre19_eval = TrialNdx.load_txt(key_sre19_eval) scr_sre19_eval = TrialScores.load_txt(scores_sre19_eval) k_janus_dev = TrialKey.load_txt(key_janus_dev) @@ -67,97 +77,194 @@ def plot_scores_sre18( scr_janus_eval = TrialScores.load_txt(scores_janus_eval) tar_sitw_core, non_sitw_core = scr_sitw_core.get_tar_non(k_sitw_core) - tar_sitw_core_multi, non_sitw_core_multi = scr_sitw_core_multi.get_tar_non(k_sitw_core_multi) + tar_sitw_core_multi, non_sitw_core_multi = scr_sitw_core_multi.get_tar_non( + k_sitw_core_multi + ) tar_sre18_eval, non_sre18_eval = scr_sre18_eval.get_tar_non(k_sre18_eval) tar_sre19_dev, non_sre19_dev = scr_sre19_dev.get_tar_non(k_sre19_dev) - #tar_sre19_eval, non_sre19_eval = scr_sre19_eval.get_tar_non(k_sre19_eval) + # tar_sre19_eval, non_sre19_eval = scr_sre19_eval.get_tar_non(k_sre19_eval) scr_sre19_eval = scr_sre19_eval.align_with_ndx(k_sre19_eval) non_sre19_eval = scr_sre19_eval.scores[k_sre19_eval.trial_mask] tar_janus_dev, non_janus_dev = scr_janus_dev.get_tar_non(k_janus_dev) tar_janus_eval, non_janus_eval = scr_janus_eval.get_tar_non(k_janus_eval) - p=0.05 - thr=-np.log(p/(1-p)) - - - plt.hist(tar_sitw_core, 80, histtype='step', density=True, color='b', - linestyle='solid', linewidth=1.5, label='SITW eval core-core') - plt.hist(non_sitw_core, 1000, histtype='step', density=True, color='b', - linestyle='solid', linewidth=1.5) - plt.hist(tar_sitw_core_multi, 80, histtype='step', density=True, color='r', - linestyle='solid', linewidth=1.5, label='SITW eval core-multi') - plt.hist(non_sitw_core_multi, 1000, histtype='step', density=True, color='r', - linestyle='solid', linewidth=1.5) - plt.hist(tar_sre18_eval, 15, histtype='step', density=True, color='g', - linestyle='solid', linewidth=1.5, label='SRE18 eval VAST') - plt.hist(non_sre18_eval, 200, histtype='step', density=True, color='g', - linestyle='solid', linewidth=1.5) - plt.hist(tar_sre19_dev, 5, histtype='step', density=True, color='c', - linestyle='solid', linewidth=1.5, label='SRE19 dev AV') - plt.hist(non_sre19_dev, 100, histtype='step', density=True, color='c', - linestyle='solid', linewidth=1.5) - plt.hist(tar_janus_dev, 10, histtype='step', density=True, color='y', - linestyle='solid', linewidth=1.5, label='JANUS dev CORE') - plt.hist(non_janus_dev, 200, histtype='step', density=True, color='y', - linestyle='solid', linewidth=1.5) - plt.hist(tar_janus_eval, 30, histtype='step', density=True, color='m', - linestyle='solid', linewidth=1.5, label='JANUS eval 
CORE') - plt.hist(non_janus_eval, 500, histtype='step', density=True, color='m', - linestyle='solid', linewidth=1.5) - - - - plt.axvline(x=thr, color='k') - #plt.title(name) - plt.xlabel('LLR score') + p = 0.05 + thr = -np.log(p / (1 - p)) + + plt.hist( + tar_sitw_core, + 80, + histtype="step", + density=True, + color="b", + linestyle="solid", + linewidth=1.5, + label="SITW eval core-core", + ) + plt.hist( + non_sitw_core, + 1000, + histtype="step", + density=True, + color="b", + linestyle="solid", + linewidth=1.5, + ) + plt.hist( + tar_sitw_core_multi, + 80, + histtype="step", + density=True, + color="r", + linestyle="solid", + linewidth=1.5, + label="SITW eval core-multi", + ) + plt.hist( + non_sitw_core_multi, + 1000, + histtype="step", + density=True, + color="r", + linestyle="solid", + linewidth=1.5, + ) + plt.hist( + tar_sre18_eval, + 15, + histtype="step", + density=True, + color="g", + linestyle="solid", + linewidth=1.5, + label="SRE18 eval VAST", + ) + plt.hist( + non_sre18_eval, + 200, + histtype="step", + density=True, + color="g", + linestyle="solid", + linewidth=1.5, + ) + plt.hist( + tar_sre19_dev, + 5, + histtype="step", + density=True, + color="c", + linestyle="solid", + linewidth=1.5, + label="SRE19 dev AV", + ) + plt.hist( + non_sre19_dev, + 100, + histtype="step", + density=True, + color="c", + linestyle="solid", + linewidth=1.5, + ) + plt.hist( + tar_janus_dev, + 10, + histtype="step", + density=True, + color="y", + linestyle="solid", + linewidth=1.5, + label="JANUS dev CORE", + ) + plt.hist( + non_janus_dev, + 200, + histtype="step", + density=True, + color="y", + linestyle="solid", + linewidth=1.5, + ) + plt.hist( + tar_janus_eval, + 30, + histtype="step", + density=True, + color="m", + linestyle="solid", + linewidth=1.5, + label="JANUS eval CORE", + ) + plt.hist( + non_janus_eval, + 500, + histtype="step", + density=True, + color="m", + linestyle="solid", + linewidth=1.5, + ) + + plt.axvline(x=thr, color="k") + # plt.title(name) + plt.xlabel("LLR score") plt.grid(True) plt.legend() plt.show() - plt.savefig(output_path + '/hist_vid0.png') + plt.savefig(output_path + "/hist_vid0.png") # plt.hist(tar_sre19_eval, 20, histtype='step', density=True, color='k', # linestyle='solid', linewidth=1.5, label='SRE19 eval AV') - plt.hist(non_sre19_eval, 200, histtype='step', density=True, color='k', - linestyle='solid', linewidth=1.5, label='SRE19 eval AV') - - plt.axvline(x=thr, color='k') + plt.hist( + non_sre19_eval, + 200, + histtype="step", + density=True, + color="k", + linestyle="solid", + linewidth=1.5, + label="SRE19 eval AV", + ) + + plt.axvline(x=thr, color="k") plt.grid(True) plt.legend() plt.show() - plt.savefig(output_path + '/hist_vid.png') + plt.savefig(output_path + "/hist_vid.png") plt.clf() - - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Plots hist scores for sre19 av') - - parser.add_argument('--key-sitw-core', dest='key_sitw_core', required=True) - parser.add_argument('--scores-sitw-core', dest='scores_sitw_core', required=True) - parser.add_argument('--key-sitw-core-multi', dest='key_sitw_core_multi', required=True) - parser.add_argument('--scores-sitw-core-multi', dest='scores_sitw_core_multi', required=True) - parser.add_argument('--key-sre18-eval', dest='key_sre18_eval', required=True) - parser.add_argument('--scores-sre18-eval', dest='scores_sre18_eval', required=True) - 
parser.add_argument('--key-sre19-dev', dest='key_sre19_dev', required=True) - parser.add_argument('--scores-sre19-dev', dest='scores_sre19_dev', required=True) - parser.add_argument('--key-sre19-eval', dest='key_sre19_eval', required=True) - parser.add_argument('--scores-sre19-eval', dest='scores_sre19_eval', required=True) - parser.add_argument('--key-janus-dev', dest='key_janus_dev', required=True) - parser.add_argument('--scores-janus-dev', dest='scores_janus_dev', required=True) - parser.add_argument('--key-janus-eval', dest='key_janus_eval', required=True) - parser.add_argument('--scores-janus-eval', dest='scores_janus_eval', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--name', dest='name', default='') - - args=parser.parse_args() - - plot_scores_sre18(**vars(args)) + fromfile_prefix_chars="@", + description="Plots hist scores for sre19 av", + ) + + parser.add_argument("--key-sitw-core", dest="key_sitw_core", required=True) + parser.add_argument("--scores-sitw-core", dest="scores_sitw_core", required=True) + parser.add_argument( + "--key-sitw-core-multi", dest="key_sitw_core_multi", required=True + ) + parser.add_argument( + "--scores-sitw-core-multi", dest="scores_sitw_core_multi", required=True + ) + parser.add_argument("--key-sre18-eval", dest="key_sre18_eval", required=True) + parser.add_argument("--scores-sre18-eval", dest="scores_sre18_eval", required=True) + parser.add_argument("--key-sre19-dev", dest="key_sre19_dev", required=True) + parser.add_argument("--scores-sre19-dev", dest="scores_sre19_dev", required=True) + parser.add_argument("--key-sre19-eval", dest="key_sre19_eval", required=True) + parser.add_argument("--scores-sre19-eval", dest="scores_sre19_eval", required=True) + parser.add_argument("--key-janus-dev", dest="key_janus_dev", required=True) + parser.add_argument("--scores-janus-dev", dest="scores_janus_dev", required=True) + parser.add_argument("--key-janus-eval", dest="key_janus_eval", required=True) + parser.add_argument("--scores-janus-eval", dest="scores_janus_eval", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--name", dest="name", default="") - + args = parser.parse_args() + + plot_scores_sre18(**vars(args)) diff --git a/egs/sre19-cmn2/v1/local/plot_scores_sre19av_sre18-9.py b/egs/sre19-cmn2/v1/local/plot_scores_sre19av_sre18-9.py index 6fdc89db..d42f32d4 100755 --- a/egs/sre19-cmn2/v1/local/plot_scores_sre19av_sre18-9.py +++ b/egs/sre19-cmn2/v1/local/plot_scores_sre19av_sre18-9.py @@ -5,9 +5,6 @@ """ - - - import sys import os import argparse @@ -16,7 +13,8 @@ import numpy as np from scipy.stats import mode import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.utils.trial_scores import TrialScores @@ -24,24 +22,32 @@ from hyperion.utils.trial_key import TrialKey -def gauss_map_adapt(mu,sigma,mu0,sigma0,N,r_mu,r_s2): +def gauss_map_adapt(mu, sigma, mu0, sigma0, N, r_mu, r_s2): - s2 = sigma**2 - s02 = sigma0**2 + s2 = sigma ** 2 + s02 = sigma0 ** 2 - alpha_mu = N/(N+r_mu) - alpha_s2 = N/(N+r_mu) - mu_map = alpha_mu * mu + (1-alpha_mu)*mu0 - s2_map = alpha_s2*s2+(1-alpha_s2)*s02 + alpha_s2*(1-alpha_mu)*(mu-mu0)**2 + alpha_mu = N / (N + r_mu) + alpha_s2 = N / (N + r_mu) + mu_map = alpha_mu * mu + (1 - alpha_mu) * mu0 + s2_map = ( + alpha_s2 * s2 + + (1 - alpha_s2) * s02 + + alpha_s2 * (1 - alpha_mu) * (mu - mu0) ** 2 + ) return mu_map, np.sqrt(s2_map) def 
plot_scores_sre18( - key_sre18_eval, scores_sre18_eval, - key_sre19_dev, scores_sre19_dev, - key_sre19_eval, scores_sre19_eval, - name, - output_path): + key_sre18_eval, + scores_sre18_eval, + key_sre19_dev, + scores_sre19_dev, + key_sre19_eval, + scores_sre19_eval, + name, + output_path, +): if not os.path.exists(output_path): os.makedirs(ouput_path) @@ -50,74 +56,105 @@ def plot_scores_sre18( scr_sre18_eval = TrialScores.load_txt(scores_sre18_eval) k_sre19_dev = TrialKey.load_txt(key_sre19_dev) scr_sre19_dev = TrialScores.load_txt(scores_sre19_dev) - #k_sre19_eval = TrialKey.load_txt(key_sre19_eval) + # k_sre19_eval = TrialKey.load_txt(key_sre19_eval) k_sre19_eval = TrialNdx.load_txt(key_sre19_eval) scr_sre19_eval = TrialScores.load_txt(scores_sre19_eval) tar_sre18_eval, non_sre18_eval = scr_sre18_eval.get_tar_non(k_sre18_eval) tar_sre19_dev, non_sre19_dev = scr_sre19_dev.get_tar_non(k_sre19_dev) - #tar_sre19_eval, non_sre19_eval = scr_sre19_eval.get_tar_non(k_sre19_eval) + # tar_sre19_eval, non_sre19_eval = scr_sre19_eval.get_tar_non(k_sre19_eval) scr_sre19_eval = scr_sre19_eval.align_with_ndx(k_sre19_eval) non_sre19_eval = scr_sre19_eval.scores[k_sre19_eval.trial_mask] - p=0.05 - thr=-np.log(p/(1-p)) - - - plt.hist(tar_sre18_eval, 15, histtype='step', density=True, color='g', - linestyle='solid', linewidth=1.5, label='SRE18 eval VAST') - plt.hist(non_sre18_eval, 200, histtype='step', density=True, color='g', - linestyle='solid', linewidth=1.5) - plt.hist(tar_sre19_dev, 5, histtype='step', density=True, color='c', - linestyle='solid', linewidth=1.5, label='SRE19 dev AV') - plt.hist(non_sre19_dev, 100, histtype='step', density=True, color='c', - linestyle='solid', linewidth=1.5) - - - - plt.axvline(x=thr, color='k') - #plt.title(name) - plt.xlabel('LLR score') + p = 0.05 + thr = -np.log(p / (1 - p)) + + plt.hist( + tar_sre18_eval, + 15, + histtype="step", + density=True, + color="g", + linestyle="solid", + linewidth=1.5, + label="SRE18 eval VAST", + ) + plt.hist( + non_sre18_eval, + 200, + histtype="step", + density=True, + color="g", + linestyle="solid", + linewidth=1.5, + ) + plt.hist( + tar_sre19_dev, + 5, + histtype="step", + density=True, + color="c", + linestyle="solid", + linewidth=1.5, + label="SRE19 dev AV", + ) + plt.hist( + non_sre19_dev, + 100, + histtype="step", + density=True, + color="c", + linestyle="solid", + linewidth=1.5, + ) + + plt.axvline(x=thr, color="k") + # plt.title(name) + plt.xlabel("LLR score") plt.grid(True) plt.legend() plt.show() - plt.savefig(output_path + '/hist_vid0.png') + plt.savefig(output_path + "/hist_vid0.png") # plt.hist(tar_sre19_eval, 20, histtype='step', density=True, color='k', # linestyle='solid', linewidth=1.5, label='SRE19 eval AV') - plt.hist(non_sre19_eval, 200, histtype='step', density=True, color='k', - linestyle='solid', linewidth=1.5, label='SRE19 eval AV') - - plt.axvline(x=thr, color='k') + plt.hist( + non_sre19_eval, + 200, + histtype="step", + density=True, + color="k", + linestyle="solid", + linewidth=1.5, + label="SRE19 eval AV", + ) + + plt.axvline(x=thr, color="k") plt.grid(True) plt.legend() plt.show() - plt.savefig(output_path + '/hist_vid.png') + plt.savefig(output_path + "/hist_vid.png") plt.clf() - - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Plots hist scores for sre19 av') - - parser.add_argument('--key-sre18-eval', dest='key_sre18_eval', required=True) - 
parser.add_argument('--scores-sre18-eval', dest='scores_sre18_eval', required=True) - parser.add_argument('--key-sre19-dev', dest='key_sre19_dev', required=True) - parser.add_argument('--scores-sre19-dev', dest='scores_sre19_dev', required=True) - parser.add_argument('--key-sre19-eval', dest='key_sre19_eval', required=True) - parser.add_argument('--scores-sre19-eval', dest='scores_sre19_eval', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--name', dest='name', default='') - - args=parser.parse_args() - - plot_scores_sre18(**vars(args)) + fromfile_prefix_chars="@", + description="Plots hist scores for sre19 av", + ) - + parser.add_argument("--key-sre18-eval", dest="key_sre18_eval", required=True) + parser.add_argument("--scores-sre18-eval", dest="scores_sre18_eval", required=True) + parser.add_argument("--key-sre19-dev", dest="key_sre19_dev", required=True) + parser.add_argument("--scores-sre19-dev", dest="scores_sre19_dev", required=True) + parser.add_argument("--key-sre19-eval", dest="key_sre19_eval", required=True) + parser.add_argument("--scores-sre19-eval", dest="scores_sre19_eval", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--name", dest="name", default="") + + args = parser.parse_args() + + plot_scores_sre18(**vars(args)) diff --git a/egs/sre19-cmn2/v1/local/plot_scores_sre19cmn2.py b/egs/sre19-cmn2/v1/local/plot_scores_sre19cmn2.py index c8d10e30..398c8db1 100755 --- a/egs/sre19-cmn2/v1/local/plot_scores_sre19cmn2.py +++ b/egs/sre19-cmn2/v1/local/plot_scores_sre19cmn2.py @@ -1,13 +1,10 @@ -k#!/usr/bin/env python +k #!/usr/bin/env python """ Plot histogram of i-vectors """ - - - import sys import os import argparse @@ -16,7 +13,8 @@ import numpy as np from scipy.stats import mode import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.utils.trial_scores import TrialScores @@ -24,24 +22,32 @@ from hyperion.utils.trial_key import TrialKey -def gauss_map_adapt(mu,sigma,mu0,sigma0,N,r_mu,r_s2): +def gauss_map_adapt(mu, sigma, mu0, sigma0, N, r_mu, r_s2): - s2 = sigma**2 - s02 = sigma0**2 + s2 = sigma ** 2 + s02 = sigma0 ** 2 - alpha_mu = N/(N+r_mu) - alpha_s2 = N/(N+r_mu) - mu_map = alpha_mu * mu + (1-alpha_mu)*mu0 - s2_map = alpha_s2*s2+(1-alpha_s2)*s02 + alpha_s2*(1-alpha_mu)*(mu-mu0)**2 + alpha_mu = N / (N + r_mu) + alpha_s2 = N / (N + r_mu) + mu_map = alpha_mu * mu + (1 - alpha_mu) * mu0 + s2_map = ( + alpha_s2 * s2 + + (1 - alpha_s2) * s02 + + alpha_s2 * (1 - alpha_mu) * (mu - mu0) ** 2 + ) return mu_map, np.sqrt(s2_map) def plot_scores_sre18( - key_sre18_dev, scores_sre18_dev, - key_sre18_eval, scores_sre18_eval, - key_sre19_eval, scores_sre19_eval, - name, - output_path): + key_sre18_dev, + scores_sre18_dev, + key_sre18_eval, + scores_sre18_eval, + key_sre19_eval, + scores_sre19_eval, + name, + output_path, +): if not os.path.exists(output_path): os.makedirs(ouput_path) @@ -53,67 +59,101 @@ def plot_scores_sre18( k_sre19_eval = TrialNdx.load_txt(key_sre19_eval) scr_sre19_eval = TrialScores.load_txt(scores_sre19_eval) - tar_sre18_dev, non_sre18_dev = scr_sre18_dev.get_tar_non(k_sre18_dev) tar_sre18_eval, non_sre18_eval = scr_sre18_eval.get_tar_non(k_sre18_eval) - #tar_sre19_eval, non_sre19_eval = scr_sre19_eval.get_tar_non(k_sre19_eval) + # tar_sre19_eval, non_sre19_eval = scr_sre19_eval.get_tar_non(k_sre19_eval) scr_sre19_eval = scr_sre19_eval.align_with_ndx(k_sre19_eval) non_sre19_eval = 
scr_sre19_eval.scores[k_sre19_eval.trial_mask] - - p=0.0075 - thr=-np.log(p/(1-p)) - - plt.hist(tar_sre18_dev, 100, histtype='step', density=True, color='b', - linestyle='solid', linewidth=1.5, label='SRE18 dev cmn2') - plt.hist(non_sre18_dev, 1000, histtype='step', density=True, color='b', - linestyle='solid', linewidth=1.5) - plt.hist(tar_sre18_eval, 100, histtype='step', density=True, color='r', - linestyle='solid', linewidth=1.5, label='SRE18 eval cmn2') - plt.hist(non_sre18_eval, 1000, histtype='step', density=True, color='r', - linestyle='solid', linewidth=1.5) - - plt.axvline(x=thr, color='k') + + p = 0.0075 + thr = -np.log(p / (1 - p)) + + plt.hist( + tar_sre18_dev, + 100, + histtype="step", + density=True, + color="b", + linestyle="solid", + linewidth=1.5, + label="SRE18 dev cmn2", + ) + plt.hist( + non_sre18_dev, + 1000, + histtype="step", + density=True, + color="b", + linestyle="solid", + linewidth=1.5, + ) + plt.hist( + tar_sre18_eval, + 100, + histtype="step", + density=True, + color="r", + linestyle="solid", + linewidth=1.5, + label="SRE18 eval cmn2", + ) + plt.hist( + non_sre18_eval, + 1000, + histtype="step", + density=True, + color="r", + linestyle="solid", + linewidth=1.5, + ) + + plt.axvline(x=thr, color="k") plt.title(name) - plt.xlabel('LLR score') + plt.xlabel("LLR score") plt.grid(True) plt.legend() plt.show() - plt.savefig(output_path + '/hist_tel0.png') + plt.savefig(output_path + "/hist_tel0.png") # plt.hist(tar_sre19_eval, 100, histtype='step', density=True, color='g', # linestyle='solid', linewidth=1.5, label='SRE18 eval cmn2') - plt.hist(non_sre19_eval, 1000, histtype='step', density=True, color='g', - linestyle='solid', linewidth=1.5, label='SRE19 eval cmn2') + plt.hist( + non_sre19_eval, + 1000, + histtype="step", + density=True, + color="g", + linestyle="solid", + linewidth=1.5, + label="SRE19 eval cmn2", + ) + + plt.axvline(x=thr, color="k") - plt.axvline(x=thr, color='k') - plt.grid(True) plt.legend() plt.show() - plt.savefig(output_path + '/hist_tel.png') + plt.savefig(output_path + "/hist_tel.png") plt.clf() - - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Plots hist scores for sre19 cmn2') - - parser.add_argument('--key-sre18-dev', dest='key_sre18_dev', required=True) - parser.add_argument('--scores-sre18-dev', dest='scores_sre18_dev', required=True) - parser.add_argument('--key-sre18-eval', dest='key_sre18_eval', required=True) - parser.add_argument('--scores-sre18-eval', dest='scores_sre18_eval', required=True) - parser.add_argument('--key-sre19-eval', dest='key_sre19_eval', required=True) - parser.add_argument('--scores-sre19-eval', dest='scores_sre19_eval', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--name', dest='name', default='') - - args=parser.parse_args() - - plot_scores_sre18(**vars(args)) + fromfile_prefix_chars="@", + description="Plots hist scores for sre19 cmn2", + ) + + parser.add_argument("--key-sre18-dev", dest="key_sre18_dev", required=True) + parser.add_argument("--scores-sre18-dev", dest="scores_sre18_dev", required=True) + parser.add_argument("--key-sre18-eval", dest="key_sre18_eval", required=True) + parser.add_argument("--scores-sre18-eval", dest="scores_sre18_eval", required=True) + parser.add_argument("--key-sre19-eval", dest="key_sre19_eval", required=True) + parser.add_argument("--scores-sre19-eval", 
dest="scores_sre19_eval", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--name", dest="name", default="") - + args = parser.parse_args() + + plot_scores_sre18(**vars(args)) diff --git a/egs/sre19-cmn2/v1/local/rttm2vad.py b/egs/sre19-cmn2/v1/local/rttm2vad.py index 4370077d..a1960411 100644 --- a/egs/sre19-cmn2/v1/local/rttm2vad.py +++ b/egs/sre19-cmn2/v1/local/rttm2vad.py @@ -7,31 +7,33 @@ import numpy as np import pandas as pd -frame_shift=0.01 +frame_shift = 0.01 + def write_vad(f, file_id, vad): - f.write('%s [ ' % (file_id)) + f.write("%s [ " % (file_id)) for i in range(len(vad)): - f.write('%d ' % vad[i]) - f.write(']\n') + f.write("%d " % vad[i]) + f.write("]\n") + - def rttm2vad_file(file_id, rttm, num_frames, fvad, fu2o, min_dur): _, spk_ids = np.unique(rttm.name, return_inverse=True) - num_spks = np.max(spk_ids)+1 + num_spks = np.max(spk_ids) + 1 if len(spk_ids) == 1: vad = np.zeros((num_frames,), dtype=int) - tbeg = np.round(rttm.tbeg/frame_shift).astype('int') - tend = min(np.round((rttm.tbeg+rttm.tdur)/frame_shift).astype('int'), num_frames) - vad[tbeg:tend+1] = 1 - file_dir_id = '%s-d%03d' % (file_id,0) + tbeg = np.round(rttm.tbeg / frame_shift).astype("int") + tend = min( + np.round((rttm.tbeg + rttm.tdur) / frame_shift).astype("int"), num_frames + ) + vad[tbeg : tend + 1] = 1 + file_dir_id = "%s-d%03d" % (file_id, 0) write_vad(fvad, file_dir_id, vad) - fu2o.write('%s %s\n' % (file_dir_id, file_id)) + fu2o.write("%s %s\n" % (file_dir_id, file_id)) return - - + total_dur = np.zeros((num_spks,), dtype=float) for i in range(num_spks): idx = spk_ids == i @@ -42,53 +44,69 @@ def rttm2vad_file(file_id, rttm, num_frames, fvad, fu2o, min_dur): if total_dur[i] >= min_dur or do_all: vad = np.zeros((num_frames,), dtype=int) idx = spk_ids == i - tbeg = np.round(np.array(rttm.tbeg.loc[idx])/frame_shift).astype('int') - tend = np.round(np.array(rttm.tbeg.loc[idx]+rttm.tdur.loc[idx])/frame_shift).astype('int') + tbeg = np.round(np.array(rttm.tbeg.loc[idx]) / frame_shift).astype("int") + tend = np.round( + np.array(rttm.tbeg.loc[idx] + rttm.tdur.loc[idx]) / frame_shift + ).astype("int") for j in range(len(tbeg)): - vad[tbeg[j]:tend[j]+1] = 1 - file_dir_id = '%s-d%03d' % (file_id,i) + vad[tbeg[j] : tend[j] + 1] = 1 + file_dir_id = "%s-d%03d" % (file_id, i) write_vad(fvad, file_dir_id, vad) - fu2o.write('%s %s\n' % (file_dir_id, file_id)) - + fu2o.write("%s %s\n" % (file_dir_id, file_id)) + def rttm2vad(rttm_file, num_frames_file, vad_file, utt2orig, min_dur): - rttm = pd.read_csv(rttm_file, sep='\s+', header=None, - names=['segment_type','file_id','chnl','tbeg','tdur', - 'ortho','stype','name','conf','slat']) + rttm = pd.read_csv( + rttm_file, + sep="\s+", + header=None, + names=[ + "segment_type", + "file_id", + "chnl", + "tbeg", + "tdur", + "ortho", + "stype", + "name", + "conf", + "slat", + ], + ) rttm.index = rttm.file_id - - df_num_frames = pd.read_csv(num_frames_file, sep='\s+', header=None, - names=['file_id','num_frames']) + + df_num_frames = pd.read_csv( + num_frames_file, sep="\s+", header=None, names=["file_id", "num_frames"] + ) df_num_frames.index = df_num_frames.file_id + with open(vad_file, "w") as fvad: + with open(utt2orig, "w") as fu2o: - with open(vad_file, 'w') as fvad: - with open(utt2orig, 'w') as fu2o: - for file_id in df_num_frames.file_id: num_frames_i = int(df_num_frames.num_frames.loc[file_id]) print(file_id) rttm_i = rttm.loc[file_id] - file_diars_ids=rttm2vad_file( - file_id, rttm_i, num_frames_i, 
fvad, fu2o, min_dur) - - + file_diars_ids = rttm2vad_file( + file_id, rttm_i, num_frames_i, fvad, fu2o, min_dur + ) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Converts RTTM to kaldi VAD files') - - parser.add_argument('--rttm',dest='rttm_file', required=True) - parser.add_argument('--num-frames', dest='num_frames_file', required=True) - parser.add_argument('--vad-file', dest='vad_file', required=True) - parser.add_argument('--utt2orig', dest='utt2orig', required=True) - parser.add_argument('--min-dur', dest='min_dur', type=float, default=10) - args=parser.parse_args() - + fromfile_prefix_chars="@", + description="Converts RTTM to kaldi VAD files", + ) + + parser.add_argument("--rttm", dest="rttm_file", required=True) + parser.add_argument("--num-frames", dest="num_frames_file", required=True) + parser.add_argument("--vad-file", dest="vad_file", required=True) + parser.add_argument("--utt2orig", dest="utt2orig", required=True) + parser.add_argument("--min-dur", dest="min_dur", type=float, default=10) + args = parser.parse_args() + rttm2vad(**vars(args)) - diff --git a/egs/sre19-cmn2/v1/local/score_dcf.py b/egs/sre19-cmn2/v1/local/score_dcf.py index ad9f6ab9..deb39682 100755 --- a/egs/sre19-cmn2/v1/local/score_dcf.py +++ b/egs/sre19-cmn2/v1/local/score_dcf.py @@ -5,9 +5,6 @@ """ - - - import sys import os import argparse @@ -24,48 +21,54 @@ def score_dcf(key_file, score_file, output_path): - logging.info('Load key: %s' % key_file) + logging.info("Load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('Load scores: %s' % score_file) + logging.info("Load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) - priors = np.array([0.001, 0.005, 0.01, 0.05 ]) + priors = np.array([0.001, 0.005, 0.01, 0.05]) min_dcf, act_dcf, eer, _ = fast_eval(tar, non, priors) - + output_dir = os.path.dirname(output_path) if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_file = output_path + '_results' - with open(output_file, 'w') as f: - s = 'EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}'.format( - eer * 100, min_dcf[3], act_dcf[3], - min_dcf[2], act_dcf[2], - min_dcf[1], act_dcf[1], - min_dcf[0], act_dcf[0]) + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + ) f.write(s) logging.info(s) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Computes EER and DCF') - - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", 
dest="key_file", required=True) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - score_dcf(**vars(args)) - + score_dcf(**vars(args)) diff --git a/egs/sre19-cmn2/v1/local/sre18_diar_to_vad.py b/egs/sre19-cmn2/v1/local/sre18_diar_to_vad.py index 5acc9081..3ca60092 100755 --- a/egs/sre19-cmn2/v1/local/sre18_diar_to_vad.py +++ b/egs/sre19-cmn2/v1/local/sre18_diar_to_vad.py @@ -5,9 +5,6 @@ # - - - import sys import os import argparse @@ -18,32 +15,40 @@ if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Diarization file to binary vad') - - parser.add_argument(dest='diar_file') - parser.add_argument(dest='num_frames_file') - - args=parser.parse_args() - - utt2num_frames = pd.read_csv(args.num_frames_file, sep=' ', header=None, names=['utt','num_frames', 'None'], index_col=0) - diar = pd.read_csv(args.diar_file, sep=' ', header=None, names=['utt', 'start','end'], index_col=0) - + fromfile_prefix_chars="@", + description="Diarization file to binary vad", + ) + + parser.add_argument(dest="diar_file") + parser.add_argument(dest="num_frames_file") + + args = parser.parse_args() + + utt2num_frames = pd.read_csv( + args.num_frames_file, + sep=" ", + header=None, + names=["utt", "num_frames", "None"], + index_col=0, + ) + diar = pd.read_csv( + args.diar_file, sep=" ", header=None, names=["utt", "start", "end"], index_col=0 + ) for key in utt2num_frames.index.values: - num_frames_i = utt2num_frames['num_frames'][key] + num_frames_i = utt2num_frames["num_frames"][key] vad = np.zeros((num_frames_i,), dtype=int) - start_i = np.array(diar.loc[key]['start'], dtype=int) - end_i = np.array(diar.loc[key]['end'], dtype=int) - if start_i.ndim==0: + start_i = np.array(diar.loc[key]["start"], dtype=int) + end_i = np.array(diar.loc[key]["end"], dtype=int) + if start_i.ndim == 0: start_i = [start_i] end_i = [end_i] - for s,e in zip(start_i,end_i): - if e > num_frames_i-1: - e = num_frames_i-1 - vad[s:e+1] = 1 - - svad = key + ' [ ' + ' '.join([str(v) for v in vad]) + ' ]' + for s, e in zip(start_i, end_i): + if e > num_frames_i - 1: + e = num_frames_i - 1 + vad[s : e + 1] = 1 + + svad = key + " [ " + " ".join([str(v) for v in vad]) + " ]" print(svad) diff --git a/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py index cffa1241..fa16dfce 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-calibration-v1.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -28,45 +25,45 @@ def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - - logging.info('load scores: %s' % in_score_file) + + logging.info("load scores: %s" % in_score_file) scr = TrialScores.load_txt(in_score_file) scr = scr.align_with_ndx(ndx) - logging.info('load model: %s' % model_file) + logging.info("load model: %s" % model_file) lr = LR.load(model_file) - logging.info('apply calibration') + logging.info("apply 
calibration") s_cal = lr.predict(scr.scores.ravel()) scr.scores = np.reshape(s_cal, scr.scores.shape) - logging.info('save scores: %s' % out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - -if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals linear calibration') - parser.add_argument('--in-score-file', dest='in_score_file', required=True) - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) +if __name__ == "__main__": - args=parser.parse_args() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Evals linear calibration", + ) + + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_calibration(**vars(args)) - + eval_calibration(**vars(args)) diff --git a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py index 03958ffc..d3b35fba 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-snorm-v1.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -28,14 +25,22 @@ from hyperion.helpers import VectorReader as VR -def eval_plda(iv_file, ndx_file, enroll_file, - preproc_file, - coh_iv_file, coh_list, coh_nbest, - model_file, score_file, plda_type, - pool_method, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + model_file, + score_file, + plda_type, + pool_method, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -45,79 +50,86 @@ def eval_plda(iv_file, ndx_file, enroll_file, x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_Nvs1(x_e, x_t, method=pool_method, ids1=ids_e) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('loading cohort data') + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_Nvs1(x_e, x_coh, method=pool_method, ids1=ids_e) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 telephone condition with S-Norm') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 telephone condition with S-Norm", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg'], - help=('(default: %(default)s)')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + help=("(default: %(default)s)"), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", 
default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py index bf085eba..820c90db 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-tel-be-v1.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -26,14 +23,20 @@ from hyperion.transforms import TransformList - -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, - pool_method, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + pool_method, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -43,54 +46,59 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_Nvs1(x_e, x_t, method=pool_method, ids1=ids_e) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 telephone condition') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 telephone condition", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg'], - help=('(default: %(default)s)')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + 
default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + help=("(default: %(default)s)"), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py index 88e2a80b..c6f62957 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v1.py @@ -18,14 +18,15 @@ from hyperion.utils.scp_list import SCPList from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -from hyperion.helpers.multi_test_trial_data_reader import MultiTestTrialDataReader as TDR +from hyperion.helpers.multi_test_trial_data_reader import ( + MultiTestTrialDataReader as TDR, +) from hyperion.helpers import PLDAFactory as F from hyperion.transforms import TransformList from hyperion.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR - def combine_diar_scores(ndx, orig_seg, subseg_scores): scores = np.zeros(ndx.trial_mask.shape, dtype=float_cpu()) @@ -33,17 +34,28 @@ def combine_diar_scores(ndx, orig_seg, subseg_scores): idx = orig_seg == ndx.seg_set[j] subseg_scores_j = subseg_scores[:, idx] scores_j = np.max(subseg_scores_j, axis=1) - scores[:,j] = scores_j + scores[:, j] = scores_j return scores -def eval_plda(iv_file, ndx_file, enroll_file, test_subseg2orig_file, - preproc_file, - coh_iv_file, coh_list, coh_nbest, coh_nbest_discard, - model_file, score_file, plda_type, **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_subseg2orig_file, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + coh_nbest_discard, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -52,85 +64,89 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_subseg2orig_file, tdr = TDR(iv_file, ndx_file, enroll_file, None, test_subseg2orig_file, preproc) x_e, x_t, enroll, ndx, orig_seg = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('loading cohort data') + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_1vs1(x_e, x_coh) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." 
% (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest, nbest_discard=coh_nbest_discard) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info(('total-scoring elapsed time: %.2f s. ' - 'elapsed time per trial: %.2f ms.') - % (dt, dt/num_trials*1000)) + logging.info( + ("total-scoring elapsed time: %.2f s. " "elapsed time per trial: %.2f ms.") + % (dt, dt / num_trials * 1000) + ) - logging.info('combine cluster scores') + logging.info("combine cluster scores") scores = combine_diar_scores(ndx, orig_seg, scores) - - logging.info('saving scores to %s' % (score_file)) + + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s = s.align_with_ndx(ndx) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA with diarization in test and AS-Norm') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-subseg2orig-file', dest='test_subseg2orig_file', - required=True) - - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--coh-nbest-discard', dest='coh_nbest_discard', - type=int, default=0) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA with diarization in test and AS-Norm", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument( + "--test-subseg2orig-file", dest="test_subseg2orig_file", required=True + ) + + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument( + "--coh-nbest-discard", dest="coh_nbest_discard", type=int, default=0 + ) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py index bd8f3b5a..ec4addef 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py 
+++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-snorm-v2.py @@ -14,17 +14,18 @@ import numpy as np from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.utils import TrialScores +from hyperion.utils import TrialScores from hyperion.helpers import MultiTestTrialDataReaderV2 as TDR from hyperion.helpers import PLDAFactory as F from hyperion.transforms import TransformList from hyperion.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR + def matlist2vec(x): for i in range(len(x)): if x[i].ndim == 1: - x[i] = x[i][None,:] + x[i] = x[i][None, :] return np.concatenate(x, axis=0) @@ -32,20 +33,31 @@ def combine_diar_scores(ndx, orig_seg, subseg_scores): scores = np.zeros(ndx.trial_mask.shape, dtype=float_cpu()) for j in range(len(ndx.seg_set)): - idx = orig_seg == j #ndx.seg_set[j] + idx = orig_seg == j # ndx.seg_set[j] subseg_scores_j = subseg_scores[:, idx] scores_j = np.max(subseg_scores_j, axis=1) - scores[:,j] = scores_j + scores[:, j] = scores_j return scores -def eval_plda(enroll_v_file, test_v_file, ndx_file, enroll_file, - preproc_file, - coh_v_file, coh_list, coh_nbest, coh_nbest_discard, - model_file, score_file, plda_type, **kwargs): - - logging.info('loading data') +def eval_plda( + enroll_v_file, + test_v_file, + ndx_file, + enroll_file, + preproc_file, + coh_v_file, + coh_list, + coh_nbest, + coh_nbest_discard, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -54,85 +66,87 @@ def eval_plda(enroll_v_file, test_v_file, ndx_file, enroll_file, tdr = TDR(enroll_v_file, test_v_file, ndx_file, enroll_file, None, preproc) x_e, x_t, enroll, ndx, orig_seg = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('loading cohort data') + logging.info("loading cohort data") vr = VR(coh_v_file, coh_list, preproc) x_coh = vr.read() if isinstance(x_coh, list): x_coh = matlist2vec(x_coh) t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_1vs1(x_e, x_coh) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest, nbest_discard=coh_nbest_discard) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info(('total-scoring elapsed time: %.2f s. ' - 'elapsed time per trial: %.2f ms.') - % (dt, dt/num_trials*1000)) + logging.info( + ("total-scoring elapsed time: %.2f s. 
" "elapsed time per trial: %.2f ms.") + % (dt, dt / num_trials * 1000) + ) - logging.info('combine cluster scores') + logging.info("combine cluster scores") scores = combine_diar_scores(ndx, orig_seg, scores) - - logging.info('saving scores to %s' % (score_file)) + + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s = s.align_with_ndx(ndx) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA with diarization in test and AS-Norm') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA with diarization in test and AS-Norm", + ) - parser.add_argument('--enroll-v-file', required=True) - parser.add_argument('--test-v-file', required=True) - parser.add_argument('--ndx-file', default=None) - parser.add_argument('--enroll-file', required=True) + parser.add_argument("--enroll-v-file", required=True) + parser.add_argument("--test-v-file", required=True) + parser.add_argument("--ndx-file", default=None) + parser.add_argument("--enroll-file", required=True) - parser.add_argument('--preproc-file', default=None) - parser.add_argument('--coh-v-file', required=True) - parser.add_argument('--coh-list', required=True) - parser.add_argument('--coh-nbest', type=int, default=100) - parser.add_argument('--coh-nbest-discard', type=int, default=0) + parser.add_argument("--preproc-file", default=None) + parser.add_argument("--coh-v-file", required=True) + parser.add_argument("--coh-list", required=True) + parser.add_argument("--coh-nbest", type=int, default=100) + parser.add_argument("--coh-nbest-discard", type=int, default=0) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py index 23e39382..20e88a37 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v1.py @@ -17,12 +17,13 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -from hyperion.helpers.multi_test_trial_data_reader import MultiTestTrialDataReader as TDR +from hyperion.helpers.multi_test_trial_data_reader import ( + MultiTestTrialDataReader as TDR, +) from hyperion.helpers import PLDAFactory as F from hyperion.transforms import TransformList - def combine_diar_scores(ndx, orig_seg, subseg_scores): scores = np.zeros(ndx.trial_mask.shape, dtype=float_cpu()) @@ -30,18 +31,24 @@ def combine_diar_scores(ndx, orig_seg, subseg_scores): idx = orig_seg == ndx.seg_set[j] subseg_scores_j = subseg_scores[:, idx] scores_j = np.max(subseg_scores_j, axis=1) - scores[:,j] = scores_j + scores[:, j] = scores_j return scores +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_subseg2orig_file, + 
preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): -def eval_plda(iv_file, ndx_file, enroll_file, test_subseg2orig_file, - preproc_file, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -50,54 +57,57 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_subseg2orig_file, tdr = TDR(iv_file, ndx_file, enroll_file, None, test_subseg2orig_file, preproc) x_e, x_t, enroll, ndx, orig_seg = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - - logging.info('computing llr') + + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('combine cluster scores') + logging.info("combine cluster scores") scores = combine_diar_scores(ndx, orig_seg, scores) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s = s.align_with_ndx(ndx) s.save_txt(score_file) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA with diarization in test') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA with diarization in test", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-subseg2orig-file', dest='test_subseg2orig_file', - required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument( + "--test-subseg2orig-file", dest="test_subseg2orig_file", required=True + ) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py index 1c130a63..b77d3595 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-diar-v2.py @@ -24,19 +24,27 @@ def combine_diar_scores(ndx, orig_seg, subseg_scores): scores = 
np.zeros(ndx.trial_mask.shape, dtype=float_cpu()) for j in range(len(ndx.seg_set)): - idx = orig_seg == j #ndx.seg_set[j] + idx = orig_seg == j # ndx.seg_set[j] subseg_scores_j = subseg_scores[:, idx] scores_j = np.max(subseg_scores_j, axis=1) - scores[:,j] = scores_j + scores[:, j] = scores_j return scores -def eval_plda(enroll_v_file, test_v_file, ndx_file, enroll_file, - preproc_file, model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + enroll_v_file, + test_v_file, + ndx_file, + enroll_file, + preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -45,53 +53,55 @@ def eval_plda(enroll_v_file, test_v_file, ndx_file, enroll_file, tdr = TDR(enroll_v_file, test_v_file, ndx_file, enroll_file, None, preproc) x_e, x_t, enroll, ndx, orig_seg = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - - logging.info('computing llr') + + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('combine cluster scores') + logging.info("combine cluster scores") scores = combine_diar_scores(ndx, orig_seg, scores) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s = s.align_with_ndx(ndx) s.save_txt(score_file) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA with diarization in test') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA with diarization in test", + ) - parser.add_argument('--enroll-v-file', required=True) - parser.add_argument('--test-v-file', required=True) - parser.add_argument('--ndx-file', required=True) - parser.add_argument('--enroll-file', required=True) - parser.add_argument('--preproc-file', default=None) + parser.add_argument("--enroll-v-file", required=True) + parser.add_argument("--test-v-file", required=True) + parser.add_argument("--ndx-file", required=True) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--preproc-file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py index a782377d..0c5b31e0 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py +++ 
b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-snorm-v1.py @@ -24,13 +24,23 @@ from hyperion.helpers import VectorReader as VR -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - coh_iv_file, coh_list, coh_nbest, coh_nbest_discard, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + coh_nbest_discard, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -38,79 +48,84 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc) x_e, x_t, enroll, ndx = tdr.read() - - logging.info('loading plda model: %s' % (model_file)) + + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) - - logging.info('loading cohort data') + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) + + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_1vs1(x_e, x_coh) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest, nbest_discard=coh_nbest_discard) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 Video condition with S-Norm') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--coh-nbest-discard', dest='coh_nbest_discard', type=int, default=0) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 Video condition with S-Norm", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument( + "--coh-nbest-discard", dest="coh_nbest_discard", type=int, default=0 + ) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py index 63e3a83b..f7d83d30 100755 --- a/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/eval-vid-be-v1.py @@ -22,12 +22,19 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -36,50 +43,51 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc) x_e, x_t, enroll, ndx = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda 
model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 Video condition') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 Video condition", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py b/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py index 23db5935..779e62af 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/train-calibration-v1.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -27,64 +24,71 @@ def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): - logging.info('load key: %s' % key_file) + logging.info("load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('load scores: %s' % score_file) + logging.info("load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - logging.info('train calibration') + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, 
n_miss, n_fa) + ) + + logging.info("train calibration") x = np.concatenate((tar, non)) - y = np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) - lr = LR(prior=prior, lambda_reg=lambda_reg, bias_scaling=1, solver='liblinear', verbose=verbose) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) lr.fit(x, y) print(lr.A) print(lr.b) - logging.info('save calibration at %s' % model_file) + logging.info("save calibration at %s" % model_file) lr.save(model_file) - logging.info('calibrate scores') + logging.info("calibrate scores") tar_cal = lr.predict(tar) non_cal = lr.predict(non) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - - - + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains llr calibration') - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('--lambda-reg', dest='lambda_reg', type=float, - default=1e-5) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Trains llr calibration", + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", dest="lambda_reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_calibration(**vars(args)) - + train_calibration(**vars(args)) diff --git a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py index 920d2171..c9f22d83 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v1.py @@ -18,14 +18,29 @@ from hyperion.utils.utt2info import Utt2Info -def train_be(iv_file, train_list, - adapt_iv_file, adapt_list, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - w_mu1, w_B1, w_W1, - w_mu2, w_B2, w_W2, num_spks, do_ahc, - output_path, **kwargs): +def train_be( + iv_file, + train_list, + adapt_iv_file, + adapt_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + w_mu1, + w_B1, + w_W1, + w_mu2, + w_B2, + w_W2, + num_spks, + do_ahc, + output_path, + **kwargs +): # Read data vcr_args = VCR.filter_args(**kwargs) @@ -35,29 +50,27 @@ def train_be(iv_file, train_list, # Train LDA t1 = time.time() - lda = 
LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - print('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + print("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - print('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + print("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - print('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + print("PLDA Elapsed time: %.2f s." % (time.time() - t1)) # Save models preproc = TransformList(lda) @@ -66,13 +79,13 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + # Compute mean for adapted data vcr = VCR(adapt_iv_file, adapt_list, None) x, class_ids = vcr.read() @@ -83,78 +96,69 @@ def train_be(iv_file, train_list, preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt.h5') - + preproc.save(output_path + "/lda_lnorm_adapt.h5") + x_ln = lnorm.predict(x_lda) plda_adapt1 = plda.copy() plda_adapt2 = plda.copy() - + elbo = plda.fit(x_ln, class_ids, epochs=20) plda_adapt1.weighted_avg_model(plda, w_mu1, w_B1, w_W1) - plda_adapt1.save(output_path + '/plda_adapt1.h5') + plda_adapt1.save(output_path + "/plda_adapt1.h5") num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt1.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt1.csv", elbo, delimiter=",") if not do_ahc: return - + scores = plda_adapt1.llr_1vs1(x_ln, x_ln) - ahc = AHC(method='average', metric='llr') + ahc = AHC(method="average", metric="llr") ahc.fit(scores) - class_ids2 = ahc.get_flat_clusters(num_spks, criterion='num_clusters') + class_ids2 = ahc.get_flat_clusters(num_spks, criterion="num_clusters") elbo = plda_adapt1.fit(x_ln, class_ids2, epochs=20) plda_adapt2.weighted_avg_model(plda_adapt1, w_mu2, w_B2, w_W2) - plda_adapt2.save(output_path + '/plda_adapt2.h5') - + plda_adapt2.save(output_path + "/plda_adapt2.h5") + num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt2.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt2.csv", elbo, delimiter=",") + + u2c_out = Utt2Info.create(vcr.u2c.key, class_ids2.astype("U")) + u2c_out.save(output_path + "/output_adapt_spk2utt.scp", sep=" ") + - u2c_out = Utt2Info.create(vcr.u2c.key, class_ids2.astype('U')) - u2c_out.save(output_path + '/output_adapt_spk2utt.scp', sep=' ') - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE18 telephone condition') - - parser.add_argument('--iv-file', 
dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file', dest='adapt_iv_file', required=True) - parser.add_argument('--adapt-list', dest='adapt_list', required=True) - parser.add_argument('--do-ahc', dest='do_ahc', default=False, action='store_true') - + fromfile_prefix_chars="@", + description="Train Back-end for SRE18 telephone condition", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file", dest="adapt_iv_file", required=True) + parser.add_argument("--adapt-list", dest="adapt_list", required=True) + parser.add_argument("--do-ahc", dest="do_ahc", default=False, action="store_true") + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=150) - parser.add_argument('--w-mu1', dest='w_mu1', type=float, - default=1) - parser.add_argument('--w-b1', dest='w_B1', type=float, - default=1) - parser.add_argument('--w-w1', dest='w_W1', type=float, - default=1) - parser.add_argument('--w-mu2', dest='w_mu2', type=float, - default=1) - parser.add_argument('--w-b2', dest='w_B2', type=float, - default=1) - parser.add_argument('--w-w2', dest='w_W2', type=float, - default=1) - parser.add_argument('--num-spks', dest='num_spks', type=int, - default=1000) - - args=parser.parse_args() - - train_be(**vars(args)) - + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=150) + parser.add_argument("--w-mu1", dest="w_mu1", type=float, default=1) + parser.add_argument("--w-b1", dest="w_B1", type=float, default=1) + parser.add_argument("--w-w1", dest="w_W1", type=float, default=1) + parser.add_argument("--w-mu2", dest="w_mu2", type=float, default=1) + parser.add_argument("--w-b2", dest="w_B2", type=float, default=1) + parser.add_argument("--w-w2", dest="w_W2", type=float, default=1) + parser.add_argument("--num-spks", dest="num_spks", type=int, default=1000) + + args = parser.parse_args() + + train_be(**vars(args)) diff --git a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py index 08310485..d8d82405 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py +++ b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v2.py @@ -4,9 +4,6 @@ """ - - - import sys import os import argparse @@ -22,15 +19,31 @@ from hyperion.utils.utt2info import Utt2Info -def train_be(iv_file, train_list, - adapt_iv_file, adapt_list, - unlab_adapt_iv_file, unlab_adapt_list, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - w_mu1, w_B1, w_W1, - w_mu2, w_B2, w_W2, num_spks_unlab, do_ahc, - output_path, **kwargs): +def train_be( + iv_file, + train_list, + adapt_iv_file, + adapt_list, + unlab_adapt_iv_file, + unlab_adapt_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + w_mu1, + w_B1, + w_W1, + w_mu2, + w_B2, + w_W2, + num_spks_unlab, + do_ahc, + output_path, + **kwargs +): # Read data vcr_args = VCR.filter_args(**kwargs) @@ -40,29 +53,27 @@ def train_be(iv_file, train_list, # Train LDA t1 = time.time() - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - print('LDA Elapsed time: %.2f s.' 
% (time.time()-t1)) + print("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - print('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + print("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - print('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + print("PLDA Elapsed time: %.2f s." % (time.time() - t1)) # Save models preproc = TransformList(lda) @@ -71,13 +82,13 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + # Load labeled adapt data vcr = VCR(adapt_iv_file, adapt_list, None) x_adapt, class_ids_adapt = vcr.read() @@ -95,31 +106,31 @@ def train_be(iv_file, train_list, preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt.h5') - + preproc.save(output_path + "/lda_lnorm_adapt.h5") + x_adapt_ln = lnorm.predict(x_adapt_lda) x_unlab_ln = lnorm.predict(x_unlab_lda) plda_adapt1 = plda.copy() - if np.max(class_ids_adapt)+1 < plda.y_dim: + if np.max(class_ids_adapt) + 1 < plda.y_dim: plda.update_V = False elbo = plda.fit(x_adapt_ln, class_ids_adapt, epochs=20) plda_adapt1.weighted_avg_model(plda, w_mu1, w_B1, w_W1) - plda_adapt1.save(output_path + '/plda_adapt1.h5') + plda_adapt1.save(output_path + "/plda_adapt1.h5") num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt1.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt1.csv", elbo, delimiter=",") if not do_ahc: return - + scores = plda_adapt1.llr_1vs1(x_unlab_ln, x_unlab_ln) - ahc = AHC(method='average', metric='llr') + ahc = AHC(method="average", metric="llr") ahc.fit(scores) - class_ids_ahc = ahc.get_flat_clusters(num_spks_unlab, criterion='num_clusters') + class_ids_ahc = ahc.get_flat_clusters(num_spks_unlab, criterion="num_clusters") x_adapt2_ln = np.concatenate((x_adapt_ln, x_unlab_ln), axis=0) class_ids_adapt2 = np.concatenate((class_ids_adapt, class_ids_ahc)) @@ -127,54 +138,49 @@ def train_be(iv_file, train_list, plda_adapt2 = plda_adapt1.copy() elbo = plda_adapt1.fit(x_adapt2_ln, class_ids_adapt2, epochs=20) plda_adapt2.weighted_avg_model(plda_adapt1, w_mu2, w_B2, w_W2) - plda_adapt2.save(output_path + '/plda_adapt2.h5') - + plda_adapt2.save(output_path + "/plda_adapt2.h5") + num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt2.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt2.csv", elbo, delimiter=",") + + u2c_out = Utt2Info.create(vcr.u2c.key, class_ids_ahc.astype("U")) + u2c_out.save(output_path + "/output_adapt_spk2utt.scp", sep=" ") + - u2c_out = Utt2Info.create(vcr.u2c.key, class_ids_ahc.astype('U')) - u2c_out.save(output_path + '/output_adapt_spk2utt.scp', sep=' ') - - if __name__ == 
"__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE19 telephone condition') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file', dest='adapt_iv_file', required=True) - parser.add_argument('--adapt-list', dest='adapt_list', required=True) - parser.add_argument('--unlab-adapt-iv-file', dest='unlab_adapt_iv_file', required=True) - parser.add_argument('--unlab-adapt-list', dest='unlab_adapt_list', required=True) - parser.add_argument('--do-ahc', dest='do_ahc', default=False, action='store_true') - + fromfile_prefix_chars="@", + description="Train Back-end for SRE19 telephone condition", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file", dest="adapt_iv_file", required=True) + parser.add_argument("--adapt-list", dest="adapt_list", required=True) + parser.add_argument( + "--unlab-adapt-iv-file", dest="unlab_adapt_iv_file", required=True + ) + parser.add_argument("--unlab-adapt-list", dest="unlab_adapt_list", required=True) + parser.add_argument("--do-ahc", dest="do_ahc", default=False, action="store_true") + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=150) - parser.add_argument('--w-mu1', dest='w_mu1', type=float, - default=1) - parser.add_argument('--w-b1', dest='w_B1', type=float, - default=1) - parser.add_argument('--w-w1', dest='w_W1', type=float, - default=1) - parser.add_argument('--w-mu2', dest='w_mu2', type=float, - default=1) - parser.add_argument('--w-b2', dest='w_B2', type=float, - default=1) - parser.add_argument('--w-w2', dest='w_W2', type=float, - default=1) - parser.add_argument('--num-spks-unlab', dest='num_spks_unlab', type=int, - default=1000) - - args=parser.parse_args() - - train_be(**vars(args)) - + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=150) + parser.add_argument("--w-mu1", dest="w_mu1", type=float, default=1) + parser.add_argument("--w-b1", dest="w_B1", type=float, default=1) + parser.add_argument("--w-w1", dest="w_W1", type=float, default=1) + parser.add_argument("--w-mu2", dest="w_mu2", type=float, default=1) + parser.add_argument("--w-b2", dest="w_B2", type=float, default=1) + parser.add_argument("--w-w2", dest="w_W2", type=float, default=1) + parser.add_argument( + "--num-spks-unlab", dest="num_spks_unlab", type=int, default=1000 + ) + + args = parser.parse_args() + + train_be(**vars(args)) diff --git a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py index 43ffa041..1b039c40 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py +++ b/egs/sre19-cmn2/v1/steps_be/train-tel-be-v3.py @@ -18,18 +18,35 @@ from hyperion.utils.utt2info import Utt2Info -def train_be(iv_file, train_list, - adapt_iv_file, adapt_list, - unlab_adapt_iv_file, unlab_adapt_list, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - w_mu1, w_B1, w_W1, - w_mu2, w_B2, w_W2, num_spks_unlab, do_ahc, - w_coral_mu, w_coral_T, - output_path, **kwargs): - - # Read train data 
+def train_be( + iv_file, + train_list, + adapt_iv_file, + adapt_list, + unlab_adapt_iv_file, + unlab_adapt_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + w_mu1, + w_B1, + w_W1, + w_mu2, + w_B2, + w_W2, + num_spks_unlab, + do_ahc, + w_coral_mu, + w_coral_T, + output_path, + **kwargs +): + + # Read train data vcr_args = VCR.filter_args(**kwargs) vcr = VCR(iv_file, train_list, None, **vcr_args) x, class_ids = vcr.read() @@ -57,7 +74,7 @@ def train_be(iv_file, train_list, # Train LDA x_lab_tot = np.concatenate((x_coral, x_adapt), axis=0) class_ids_lab_tot = np.concatenate((class_ids, class_ids_adapt)) - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x_lab_tot, class_ids_lab_tot) del x_lab_tot @@ -66,12 +83,12 @@ def train_be(iv_file, train_list, x_adapt_lda = lda.predict(x_adapt) x_unlab_lda = lda.predict(x_unlab) - print('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + print("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() x_lda_all = np.concatenate((x_lda, x_adapt_lda, x_unlab_lda), axis=0) - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda_all) del x_lda_all @@ -88,20 +105,20 @@ def train_be(iv_file, train_list, # Apply lnorm to in-domain x_adapt_ln = lnorm_in.predict(x_adapt_lda) x_unlab_ln = lnorm_in.predict(x_unlab_lda) - print('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + print("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() x_lab_ln = np.concatenate((x_ln, x_adapt_ln), axis=0) class_ids_lab_tot = np.concatenate((class_ids, class_ids_adapt)) - #x_lab_ln = x_ln - #class_ids_lab_tot = class_ids - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_lab_ln, class_ids_lab_tot, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + # x_lab_ln = x_ln + # class_ids_lab_tot = class_ids + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit( + x_lab_ln, class_ids_lab_tot, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs + ) - print('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + print("PLDA Elapsed time: %.2f s." 
% (time.time() - t1)) # Save models preproc = TransformList(lda) @@ -110,38 +127,38 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") preproc = TransformList(lda) preproc.append(lnorm_in) - preproc.save(output_path + '/lda_lnorm_adapt.h5') - + preproc.save(output_path + "/lda_lnorm_adapt.h5") + plda_adapt1 = plda.copy() - if np.max(class_ids_adapt)+1 < plda.y_dim: + if np.max(class_ids_adapt) + 1 < plda.y_dim: plda.update_V = False elbo = plda.fit(x_adapt_ln, class_ids_adapt, epochs=20) plda_adapt1.weighted_avg_model(plda, w_mu1, w_B1, w_W1) - plda_adapt1.save(output_path + '/plda_adapt1.h5') + plda_adapt1.save(output_path + "/plda_adapt1.h5") num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt1.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt1.csv", elbo, delimiter=",") if not do_ahc: return - + scores = plda_adapt1.llr_1vs1(x_unlab_ln, x_unlab_ln) - ahc = AHC(method='average', metric='llr') + ahc = AHC(method="average", metric="llr") ahc.fit(scores) - class_ids_ahc = ahc.get_flat_clusters(num_spks_unlab, criterion='num_clusters') + class_ids_ahc = ahc.get_flat_clusters(num_spks_unlab, criterion="num_clusters") x_adapt2_ln = np.concatenate((x_adapt_ln, x_unlab_ln), axis=0) class_ids_adapt2 = np.concatenate((class_ids_adapt, class_ids_ahc)) @@ -149,58 +166,51 @@ def train_be(iv_file, train_list, plda_adapt2 = plda_adapt1.copy() elbo = plda_adapt1.fit(x_adapt2_ln, class_ids_adapt2, epochs=20) plda_adapt2.weighted_avg_model(plda_adapt1, w_mu2, w_B2, w_W2) - plda_adapt2.save(output_path + '/plda_adapt2.h5') - + plda_adapt2.save(output_path + "/plda_adapt2.h5") + num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt2.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt2.csv", elbo, delimiter=",") + + u2c_out = Utt2Info.create(vcr.u2c.key, class_ids_ahc.astype("U")) + u2c_out.save(output_path + "/output_adapt_spk2utt.scp", sep=" ") + - u2c_out = Utt2Info.create(vcr.u2c.key, class_ids_ahc.astype('U')) - u2c_out.save(output_path + '/output_adapt_spk2utt.scp', sep=' ') - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE19 telephone condition') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file', dest='adapt_iv_file', required=True) - parser.add_argument('--adapt-list', dest='adapt_list', required=True) - parser.add_argument('--unlab-adapt-iv-file', dest='unlab_adapt_iv_file', required=True) - parser.add_argument('--unlab-adapt-list', dest='unlab_adapt_list', required=True) - parser.add_argument('--do-ahc', dest='do_ahc', default=False, action='store_true') - + fromfile_prefix_chars="@", + description="Train Back-end for SRE19 telephone condition", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + 
parser.add_argument("--adapt-iv-file", dest="adapt_iv_file", required=True) + parser.add_argument("--adapt-list", dest="adapt_list", required=True) + parser.add_argument( + "--unlab-adapt-iv-file", dest="unlab_adapt_iv_file", required=True + ) + parser.add_argument("--unlab-adapt-list", dest="unlab_adapt_list", required=True) + parser.add_argument("--do-ahc", dest="do_ahc", default=False, action="store_true") + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=150) - parser.add_argument('--w-coral-mu', dest='w_coral_mu', type=float, - default=0.5) - parser.add_argument('--w-coral-t', dest='w_coral_T', type=float, - default=0.75) - parser.add_argument('--w-mu1', dest='w_mu1', type=float, - default=1) - parser.add_argument('--w-b1', dest='w_B1', type=float, - default=1) - parser.add_argument('--w-w1', dest='w_W1', type=float, - default=1) - parser.add_argument('--w-mu2', dest='w_mu2', type=float, - default=1) - parser.add_argument('--w-b2', dest='w_B2', type=float, - default=1) - parser.add_argument('--w-w2', dest='w_W2', type=float, - default=1) - parser.add_argument('--num-spks-unlab', dest='num_spks_unlab', type=int, - default=1000) - - args=parser.parse_args() - - train_be(**vars(args)) - + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=150) + parser.add_argument("--w-coral-mu", dest="w_coral_mu", type=float, default=0.5) + parser.add_argument("--w-coral-t", dest="w_coral_T", type=float, default=0.75) + parser.add_argument("--w-mu1", dest="w_mu1", type=float, default=1) + parser.add_argument("--w-b1", dest="w_B1", type=float, default=1) + parser.add_argument("--w-w1", dest="w_W1", type=float, default=1) + parser.add_argument("--w-mu2", dest="w_mu2", type=float, default=1) + parser.add_argument("--w-b2", dest="w_B2", type=float, default=1) + parser.add_argument("--w-w2", dest="w_W2", type=float, default=1) + parser.add_argument( + "--num-spks-unlab", dest="num_spks_unlab", type=int, default=1000 + ) + + args = parser.parse_args() + + train_be(**vars(args)) diff --git a/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py b/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py index bcdc7b91..f825d59b 100755 --- a/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py +++ b/egs/sre19-cmn2/v1/steps_be/train-vid-be-v1.py @@ -21,53 +21,62 @@ from hyperion.helpers import PLDAFactory as F from hyperion.utils.scp_list import SCPList + def matlist2vec(x): for i in range(len(x)): if x[i].ndim == 1: - x[i] = x[i][None,:] + x[i] = x[i][None, :] return np.concatenate(x, axis=0) -def train_be(iv_file, train_list, - adapt_iv_file_1, adapt_list_1, - adapt_iv_file_2, adapt_list_2, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, r2, - output_path, **kwargs): +def train_be( + iv_file, + train_list, + adapt_iv_file_1, + adapt_list_1, + adapt_iv_file_2, + adapt_list_2, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + r2, + output_path, + **kwargs +): # Read data vcr_args = VCR.filter_args(**kwargs) vcr_train = VCR(iv_file, train_list, None, **vcr_args) x, class_ids = vcr_train.read() - # Train LDA t1 = time.time() - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA Elapsed time: %.2f s.' 
% (time.time()-t1)) + logging.info("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("PLDA Elapsed time: %.2f s." % (time.time() - t1)) # Save models preproc = TransformList(lda) @@ -76,33 +85,32 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + # Compute mean for adapted data t1 = time.time() vr = VR(adapt_iv_file_1, adapt_list_1, None) x = vr.read() if isinstance(x, list): x = matlist2vec(x) - + x = lda.predict(x) lnorm.update_T = False lnorm.fit(x) - + preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt.h5') - + preproc.save(output_path + "/lda_lnorm_adapt.h5") # Compute mean for adapted data 2 if adapt_list_2 is None: return - + vr = VR(adapt_iv_file_2, adapt_list_2, None) x = vr.read() if isinstance(x, list): @@ -110,43 +118,42 @@ def train_be(iv_file, train_list, x = lda.predict(x) N = x.shape[0] - alpha = N/(N+r2) - lnorm.mu = alpha*np.mean(x, axis=0) + (1-alpha)*lnorm.mu + alpha = N / (N + r2) + lnorm.mu = alpha * np.mean(x, axis=0) + (1 - alpha) * lnorm.mu preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt2.h5') - logging.info('Adapt Elapsed time: %.2f s.' % (time.time()-t1)) + preproc.save(output_path + "/lda_lnorm_adapt2.h5") + logging.info("Adapt Elapsed time: %.2f s." 
% (time.time() - t1)) + - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE18 video condition') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file-1', dest='adapt_iv_file_1', required=True) - parser.add_argument('--adapt-list-1', dest='adapt_list_1', required=True) - parser.add_argument('--adapt-iv-file-2', dest='adapt_iv_file_2', default=None) - parser.add_argument('--adapt-list-2', dest='adapt_list_2', default=None) - parser.add_argument('--r-2', dest='r2', default=14, type=float) - + fromfile_prefix_chars="@", + description="Train Back-end for SRE18 video condition", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file-1", dest="adapt_iv_file_1", required=True) + parser.add_argument("--adapt-list-1", dest="adapt_list_1", required=True) + parser.add_argument("--adapt-iv-file-2", dest="adapt_iv_file_2", default=None) + parser.add_argument("--adapt-list-2", dest="adapt_list_2", default=None) + parser.add_argument("--r-2", dest="r2", default=14, type=float) + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) train_be(**vars(args)) - - diff --git a/egs/sre19-cmn2/v1/steps_kaldi_diar/make_rttm.py b/egs/sre19-cmn2/v1/steps_kaldi_diar/make_rttm.py index cc1145ab..ace8f8e6 100755 --- a/egs/sre19-cmn2/v1/steps_kaldi_diar/make_rttm.py +++ b/egs/sre19-cmn2/v1/steps_kaldi_diar/make_rttm.py @@ -35,96 +35,112 @@ import argparse import sys -sys.path.append('steps/libs') +sys.path.append("steps/libs") import common as common_lib def get_args(): - parser = argparse.ArgumentParser( - description="""This script converts a segments and labels file + parser = argparse.ArgumentParser( + description="""This script converts a segments and labels file to a NIST RTTM file. It handles overlapping segments (e.g. the - output of a sliding-window diarization system).""") + output of a sliding-window diarization system).""" + ) - parser.add_argument("segments", type=str, - help="Input segments file") - parser.add_argument("labels", type=str, - help="Input labels file") - parser.add_argument("rttm_file", type=str, - help="Output RTTM file") - parser.add_argument("--rttm-channel", type=int, default=0, - help="The value passed into the RTTM channel field. 
\ - Only affects the format of the RTTM file.") + parser.add_argument("segments", type=str, help="Input segments file") + parser.add_argument("labels", type=str, help="Input labels file") + parser.add_argument("rttm_file", type=str, help="Output RTTM file") + parser.add_argument( + "--rttm-channel", + type=int, + default=0, + help="The value passed into the RTTM channel field. \ + Only affects the format of the RTTM file.", + ) + + args = parser.parse_args() + return args - args = parser.parse_args() - return args def main(): - args = get_args() - - # File containing speaker labels per segment - seg2label = {} - with common_lib.smart_open(args.labels) as labels_file: - for line in labels_file: - seg, label = line.strip().split() - seg2label[seg] = label - - # Segments file - reco2segs = {} - with common_lib.smart_open(args.segments) as segments_file: - for line in segments_file: - seg, reco, start, end = line.strip().split() - try: - if reco in reco2segs: - reco2segs[reco] = reco2segs[reco] + " " + start + "," + end + "," + seg2label[seg] - else: - reco2segs[reco] = reco + " " + start + "," + end + "," + seg2label[seg] - except KeyError: - raise RuntimeError("Missing label for segment {0}".format(seg)) - - # Cut up overlapping segments so they are contiguous - contiguous_segs = [] - for reco in sorted(reco2segs): - segs = reco2segs[reco].strip().split() - new_segs = "" - for i in range(1, len(segs)-1): - start, end, label = segs[i].split(',') - next_start, next_end, next_label = segs[i+1].split(',') - if float(end) > float(next_start): - done = False - avg = str((float(next_start) + float(end)) / 2.0) - segs[i+1] = ','.join([avg, next_end, next_label]) - new_segs += " " + start + "," + avg + "," + label - else: + args = get_args() + + # File containing speaker labels per segment + seg2label = {} + with common_lib.smart_open(args.labels) as labels_file: + for line in labels_file: + seg, label = line.strip().split() + seg2label[seg] = label + + # Segments file + reco2segs = {} + with common_lib.smart_open(args.segments) as segments_file: + for line in segments_file: + seg, reco, start, end = line.strip().split() + try: + if reco in reco2segs: + reco2segs[reco] = ( + reco2segs[reco] + " " + start + "," + end + "," + seg2label[seg] + ) + else: + reco2segs[reco] = ( + reco + " " + start + "," + end + "," + seg2label[seg] + ) + except KeyError: + raise RuntimeError("Missing label for segment {0}".format(seg)) + + # Cut up overlapping segments so they are contiguous + contiguous_segs = [] + for reco in sorted(reco2segs): + segs = reco2segs[reco].strip().split() + new_segs = "" + for i in range(1, len(segs) - 1): + start, end, label = segs[i].split(",") + next_start, next_end, next_label = segs[i + 1].split(",") + if float(end) > float(next_start): + done = False + avg = str((float(next_start) + float(end)) / 2.0) + segs[i + 1] = ",".join([avg, next_end, next_label]) + new_segs += " " + start + "," + avg + "," + label + else: + new_segs += " " + start + "," + end + "," + label + start, end, label = segs[-1].split(",") new_segs += " " + start + "," + end + "," + label - start, end, label = segs[-1].split(',') - new_segs += " " + start + "," + end + "," + label - contiguous_segs.append(reco + new_segs) - - # Merge contiguous segments of the same label - merged_segs = [] - for reco_line in contiguous_segs: - segs = reco_line.strip().split() - reco = segs[0] - new_segs = "" - for i in range(1, len(segs)-1): - start, end, label = segs[i].split(',') - next_start, next_end, next_label = 
segs[i+1].split(',') - if float(end) == float(next_start) and label == next_label: - segs[i+1] = ','.join([start, next_end, next_label]) - else: + contiguous_segs.append(reco + new_segs) + + # Merge contiguous segments of the same label + merged_segs = [] + for reco_line in contiguous_segs: + segs = reco_line.strip().split() + reco = segs[0] + new_segs = "" + for i in range(1, len(segs) - 1): + start, end, label = segs[i].split(",") + next_start, next_end, next_label = segs[i + 1].split(",") + if float(end) == float(next_start) and label == next_label: + segs[i + 1] = ",".join([start, next_end, next_label]) + else: + new_segs += " " + start + "," + end + "," + label + start, end, label = segs[-1].split(",") new_segs += " " + start + "," + end + "," + label - start, end, label = segs[-1].split(',') - new_segs += " " + start + "," + end + "," + label - merged_segs.append(reco + new_segs) - - with common_lib.smart_open(args.rttm_file, 'w') as rttm_writer: - for reco_line in merged_segs: - segs = reco_line.strip().split() - reco = segs[0] - for i in range(1, len(segs)): - start, end, label = segs[i].strip().split(',') - print("SPEAKER {0} {1} {2:7.3f} {3:7.3f} {4} ".format( - reco, args.rttm_channel, float(start), float(end)-float(start), label), file=rttm_writer) - -if __name__ == '__main__': - main() + merged_segs.append(reco + new_segs) + + with common_lib.smart_open(args.rttm_file, "w") as rttm_writer: + for reco_line in merged_segs: + segs = reco_line.strip().split() + reco = segs[0] + for i in range(1, len(segs)): + start, end, label = segs[i].strip().split(",") + print( + "SPEAKER {0} {1} {2:7.3f} {3:7.3f} {4} ".format( + reco, + args.rttm_channel, + float(start), + float(end) - float(start), + label, + ), + file=rttm_writer, + ) + + +if __name__ == "__main__": + main() diff --git a/egs/sre19-cmn2/v1/steps_kaldi_xvec/allocate_egs.py b/egs/sre19-cmn2/v1/steps_kaldi_xvec/allocate_egs.py index 72a4572d..e4b58c68 100755 --- a/egs/sre19-cmn2/v1/steps_kaldi_xvec/allocate_egs.py +++ b/egs/sre19-cmn2/v1/steps_kaldi_xvec/allocate_egs.py @@ -67,51 +67,97 @@ from __future__ import print_function import re, os, argparse, sys, math, warnings, random + def get_args(): - parser = argparse.ArgumentParser(description="Writes ranges.*, outputs.* and archive_chunk_lengths files " - "in preparation for dumping egs for xvector training.", - epilog="Called by sid/nnet3/xvector/get_egs.sh") - parser.add_argument("--prefix", type=str, default="", - help="Adds a prefix to the output files. This is used to distinguish between the train " - "and diagnostic files.") - parser.add_argument("--num-repeats", type=int, default=10, help="Number of times each speaker repeats within an archive.") - parser.add_argument("--min-frames-per-chunk", type=int, default=50, - help="Minimum number of frames-per-chunk used for any archive") - parser.add_argument("--max-frames-per-chunk", type=int, default=300, - help="Maximum number of frames-per-chunk used for any archive") - parser.add_argument("--randomize-chunk-length", type=str, - help="If true, randomly pick a chunk length in [min-frames-per-chunk, max-frames-per-chunk]." 
- "If false, the chunk length varies from min-frames-per-chunk to max-frames-per-chunk" - "according to a geometric sequence.", - default="true", choices = ["false", "true"]) - parser.add_argument("--frames-per-iter", type=int, default=1000000, - help="Target number of frames for each archive") - parser.add_argument("--num-archives", type=int, default=-1, - help="Number of archives to write"); - parser.add_argument("--num-jobs", type=int, default=-1, - help="Number of jobs we're going to use to write the archives; the ranges.* " - "and outputs.* files are indexed by job. Must be <= the --num-archives option."); - parser.add_argument("--seed", type=int, default=123, - help="Seed for random number generator") - parser.add_argument("--num-pdfs", type=int, default=-1, - help="Num pdfs") + parser = argparse.ArgumentParser( + description="Writes ranges.*, outputs.* and archive_chunk_lengths files " + "in preparation for dumping egs for xvector training.", + epilog="Called by sid/nnet3/xvector/get_egs.sh", + ) + parser.add_argument( + "--prefix", + type=str, + default="", + help="Adds a prefix to the output files. This is used to distinguish between the train " + "and diagnostic files.", + ) + parser.add_argument( + "--num-repeats", + type=int, + default=10, + help="Number of times each speaker repeats within an archive.", + ) + parser.add_argument( + "--min-frames-per-chunk", + type=int, + default=50, + help="Minimum number of frames-per-chunk used for any archive", + ) + parser.add_argument( + "--max-frames-per-chunk", + type=int, + default=300, + help="Maximum number of frames-per-chunk used for any archive", + ) + parser.add_argument( + "--randomize-chunk-length", + type=str, + help="If true, randomly pick a chunk length in [min-frames-per-chunk, max-frames-per-chunk]." + "If false, the chunk length varies from min-frames-per-chunk to max-frames-per-chunk" + "according to a geometric sequence.", + default="true", + choices=["false", "true"], + ) + parser.add_argument( + "--frames-per-iter", + type=int, + default=1000000, + help="Target number of frames for each archive", + ) + parser.add_argument( + "--num-archives", type=int, default=-1, help="Number of archives to write" + ) + parser.add_argument( + "--num-jobs", + type=int, + default=-1, + help="Number of jobs we're going to use to write the archives; the ranges.* " + "and outputs.* files are indexed by job. Must be <= the --num-archives option.", + ) + parser.add_argument( + "--seed", type=int, default=123, help="Seed for random number generator" + ) + parser.add_argument("--num-pdfs", type=int, default=-1, help="Num pdfs") # now the positional arguments - parser.add_argument("--utt2len-filename", type=str, required=True, - help="utt2len file of the features to be used as input (format is: " - " )"); - parser.add_argument("--utt2int-filename", type=str, required=True, - help="utt2int file of the features to be used as input (format is: " - " )"); - parser.add_argument("--egs-dir", type=str, required=True, - help="Name of egs directory, e.g. exp/xvector_a/egs"); - - print(' '.join(sys.argv), file=sys.stderr) + parser.add_argument( + "--utt2len-filename", + type=str, + required=True, + help="utt2len file of the features to be used as input (format is: " + " )", + ) + parser.add_argument( + "--utt2int-filename", + type=str, + required=True, + help="utt2int file of the features to be used as input (format is: " + " )", + ) + parser.add_argument( + "--egs-dir", + type=str, + required=True, + help="Name of egs directory, e.g. 
exp/xvector_a/egs", + ) + + print(" ".join(sys.argv), file=sys.stderr) print(sys.argv, file=sys.stderr) args = parser.parse_args() args = process_args(args) return args + def process_args(args): if args.num_repeats < 1: raise Exception("--num-repeats should have a minimum value of 1") @@ -131,6 +177,7 @@ def process_args(args): raise Exception("--num-jobs is invalid (must not exceed num-archives)") return args + # Create utt2len def get_utt2len(utt2len_filename): utt2len = {} @@ -148,6 +195,7 @@ def get_utt2len(utt2len_filename): return utt2len # Done utt2len + # Handle utt2int, create spk2utt, spks def get_labels(utt2int_filename): f = open(utt2int_filename, "r") @@ -177,29 +225,37 @@ def get_labels(utt2int_filename): def get_random_utt(spkr, spk2utt, min_length): this_utts = spk2utt[spkr] this_num_utts = len(this_utts) - i = random.randint(0, this_num_utts-1) + i = random.randint(0, this_num_utts - 1) utt = this_utts[i] return utt + def random_chunk_length(min_frames_per_chunk, max_frames_per_chunk): ans = random.randint(min_frames_per_chunk, max_frames_per_chunk) return ans + # This function returns an integer in the range # [min-frames-per-chunk, max-frames-per-chunk] according to a geometric # sequence. For example, suppose min-frames-per-chunk is 50, # max-frames-per-chunk is 200, and args.num_archives is 3. Then the # lengths for archives 0, 1, and 2 will be 50, 100, and 200. -def deterministic_chunk_length(archive_id, num_archives, min_frames_per_chunk, max_frames_per_chunk): - if max_frames_per_chunk == min_frames_per_chunk: - return max_frames_per_chunk - elif num_archives == 1: - return int(max_frames_per_chunk); - else: - return int(math.pow(float(max_frames_per_chunk) / - min_frames_per_chunk, float(archive_id) / - (num_archives-1)) * min_frames_per_chunk + 0.5) - +def deterministic_chunk_length( + archive_id, num_archives, min_frames_per_chunk, max_frames_per_chunk +): + if max_frames_per_chunk == min_frames_per_chunk: + return max_frames_per_chunk + elif num_archives == 1: + return int(max_frames_per_chunk) + else: + return int( + math.pow( + float(max_frames_per_chunk) / min_frames_per_chunk, + float(archive_id) / (num_archives - 1), + ) + * min_frames_per_chunk + + 0.5 + ) # given an utterance length utt_length (in frames) and two desired chunk lengths @@ -229,7 +285,7 @@ def main(): # frames in examples of that archive. 
archive_chunk_lengths = [] # all_egs contains 2-tuples of the form (utt-id, offset) - all_egs= [] + all_egs = [] prefix = "" if args.prefix != "": @@ -237,18 +293,29 @@ def main(): info_f = open(args.egs_dir + "/temp/" + prefix + "archive_chunk_lengths", "w") if info_f is None: - sys.exit(str("Error opening file {0}/temp/" + prefix + "archive_chunk_lengths").format(args.egs_dir)); + sys.exit( + str( + "Error opening file {0}/temp/" + prefix + "archive_chunk_lengths" + ).format(args.egs_dir) + ) for archive_index in range(args.num_archives): print("Processing archive {0}".format(archive_index + 1)) if args.randomize_chunk_length == "true": # don't constrain the lengths to be the same - length = random_chunk_length(args.min_frames_per_chunk, args.max_frames_per_chunk) + length = random_chunk_length( + args.min_frames_per_chunk, args.max_frames_per_chunk + ) else: - length = deterministic_chunk_length(archive_index, args.num_archives, args.min_frames_per_chunk, args.max_frames_per_chunk); + length = deterministic_chunk_length( + archive_index, + args.num_archives, + args.min_frames_per_chunk, + args.max_frames_per_chunk, + ) print("{0} {1}".format(archive_index + 1, length), file=info_f) archive_chunk_lengths.append(length) this_num_egs = int((args.frames_per_iter / length) + 1) - this_egs = [ ] # A 2-tuple of the form (utt-id, start-frame) + this_egs = [] # A 2-tuple of the form (utt-id, start-frame) spkrs = args.num_repeats * list(spk2utt.keys()) random.shuffle(spkrs) for n in range(this_num_egs): @@ -259,14 +326,16 @@ def main(): utt = get_random_utt(spkr, spk2utt, length) utt_len = utt2len[utt] offset = get_random_offset(utt_len, length) - this_egs.append( (utt, offset) ) + this_egs.append((utt, offset)) all_egs.append(this_egs) info_f.close() # work out how many archives we assign to each job in an equitable way. - num_archives_per_job = [ 0 ] * args.num_jobs + num_archives_per_job = [0] * args.num_jobs for i in range(0, args.num_archives): - num_archives_per_job[i % args.num_jobs] = num_archives_per_job[i % args.num_jobs] + 1 + num_archives_per_job[i % args.num_jobs] = ( + num_archives_per_job[i % args.num_jobs] + 1 + ) pdf2num = {} cur_archive = 0 @@ -278,48 +347,80 @@ def main(): for i in range(0, this_num_archives): this_archives_for_job.append(cur_archive) for (utterance_index, offset) in all_egs[cur_archive]: - this_ranges.append( (utterance_index, i, offset) ) + this_ranges.append((utterance_index, i, offset)) cur_archive = cur_archive + 1 f = open(args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1), "w") if f is None: - sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "ranges." + str(job + 1)) + sys.exit( + "Error opening file " + + args.egs_dir + + "/temp/" + + prefix + + "ranges." + + str(job + 1) + ) for (utterance_index, i, offset) in sorted(this_ranges): archive_index = this_archives_for_job[i] - print("{0} {1} {2} {3} {4} {5}".format(utterance_index, - i, - archive_index + 1, - offset, - archive_chunk_lengths[archive_index], - utt2spk[utterance_index]), - file=f) + print( + "{0} {1} {2} {3} {4} {5}".format( + utterance_index, + i, + archive_index + 1, + offset, + archive_chunk_lengths[archive_index], + utt2spk[utterance_index], + ), + file=f, + ) if utt2spk[utterance_index] in pdf2num: - pdf2num[utt2spk[utterance_index]] += 1 + pdf2num[utt2spk[utterance_index]] += 1 else: pdf2num[utt2spk[utterance_index]] = 1 f.close() - f = open(args.egs_dir + "/temp/" + prefix + "outputs." 
+ str(job + 1), "w") if f is None: - sys.exit("Error opening file " + args.egs_dir + "/temp/" + prefix + "outputs." + str(job + 1)) - print( " ".join([ str("{0}/" + prefix + "egs_temp.{1}.ark").format(args.egs_dir, n + 1) for n in this_archives_for_job ]), - file=f) + sys.exit( + "Error opening file " + + args.egs_dir + + "/temp/" + + prefix + + "outputs." + + str(job + 1) + ) + print( + " ".join( + [ + str("{0}/" + prefix + "egs_temp.{1}.ark").format( + args.egs_dir, n + 1 + ) + for n in this_archives_for_job + ] + ), + file=f, + ) f.close() f = open(args.egs_dir + "/" + prefix + "pdf2num", "w") nums = [] for k in range(0, args.num_pdfs): if k in pdf2num: - nums.append(pdf2num[k]) + nums.append(pdf2num[k]) else: - nums.append(0) + nums.append(0) print(" ".join(map(str, nums)), file=f) f.close() - print("allocate_egs.py: finished generating " + prefix + "ranges.* and " + prefix + "outputs.* files") + print( + "allocate_egs.py: finished generating " + + prefix + + "ranges.* and " + + prefix + + "outputs.* files" + ) + if __name__ == "__main__": main() - diff --git a/egs/sre20-cts/v1/local/score_dcf.py b/egs/sre20-cts/v1/local/score_dcf.py index 4026d7c9..1137e049 100755 --- a/egs/sre20-cts/v1/local/score_dcf.py +++ b/egs/sre20-cts/v1/local/score_dcf.py @@ -20,48 +20,54 @@ def score_dcf(key_file, score_file, output_path): - logging.info('Load key: %s' % key_file) + logging.info("Load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('Load scores: %s' % score_file) + logging.info("Load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) - priors = np.array([0.001, 0.005, 0.01, 0.05 ]) + priors = np.array([0.001, 0.005, 0.01, 0.05]) min_dcf, act_dcf, eer, _ = fast_eval(tar, non, priors) - + output_dir = os.path.dirname(output_path) if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_file = output_path + '_results' - with open(output_file, 'w') as f: - s = 'EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}'.format( - eer * 100, min_dcf[3], act_dcf[3], - min_dcf[2], act_dcf[2], - min_dcf[1], act_dcf[1], - min_dcf[0], act_dcf[0]) + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + ) f.write(s) logging.info(s) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Computes EER and DCF') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 
1, 2, 3], type=int) - - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - score_dcf(**vars(args)) - + score_dcf(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py b/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py index 8b1c02d0..a5373bf4 100755 --- a/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py +++ b/egs/sre20-cts/v1/steps_be/apply-ahc-v1.py @@ -14,14 +14,16 @@ import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.hyp_defs import float_cpu, config_logger from hyperion.helpers import VectorClassReader as VCR -#from hyperion.utils.trial_ndx import TrialNdx -#from hyperion.utils.trial_scores import TrialScores -#from hyperion.helpers import TrialDataReader as TDR + +# from hyperion.utils.trial_ndx import TrialNdx +# from hyperion.utils.trial_scores import TrialScores +# from hyperion.helpers import TrialDataReader as TDR from hyperion.helpers import PLDAFactory as F from hyperion.transforms import TransformList from hyperion.score_norm import AdaptSNorm as SNorm @@ -29,12 +31,24 @@ from hyperion.utils import Utt2Info from hyperion.classifiers import BinaryLogisticRegression as LR -def apply_ahc(v_file, input_list, output_list, - preproc_file, model_file, plda_type, - cal_file, score_hist_file, threshold, - pool_method, coh_nbest, class_prefix, **kwargs): - logging.info('loading data') +def apply_ahc( + v_file, + input_list, + output_list, + preproc_file, + model_file, + plda_type, + cal_file, + score_hist_file, + threshold, + pool_method, + coh_nbest, + class_prefix, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -45,88 +59,104 @@ def apply_ahc(v_file, input_list, output_list, vcr = VCR(v_file, input_list, preproc=preproc, **vcr_args) x, class_ids = vcr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x, x) - #scores = model.llr_NvsM(x, x, method=pool_method, ids1=class_ids, ids2=class_ids) - + # scores = model.llr_NvsM(x, x, method=pool_method, ids1=class_ids, ids2=class_ids) + dt = time.time() - t1 - num_trials = x.shape[0] **2 - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + num_trials = x.shape[0] ** 2 + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest) scores = snorm.predict(scores, scores, scores) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." 
% (dt)) if cal_file is not None: - logging.info('load calibration model: %s' % cal_file) + logging.info("load calibration model: %s" % cal_file) lr = LR.load(cal_file) - logging.info('apply calibration') + logging.info("apply calibration") s_cal = lr.predict(scores.ravel()) scores = np.reshape(s_cal, scores.shape) if score_hist_file is not None: - plt.hist(scores.ravel(), 1000, histtype='step', density=True, color='b', - linestyle='solid', linewidth=1.5) - plt.axvline(x=threshold, color='k') - plt.xlabel('LLR score') + plt.hist( + scores.ravel(), + 1000, + histtype="step", + density=True, + color="b", + linestyle="solid", + linewidth=1.5, + ) + plt.axvline(x=threshold, color="k") + plt.xlabel("LLR score") plt.grid(True) # plt.legend() plt.savefig(score_hist_file) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - ahc = AHC(method='average', metric='llr') + ahc = AHC(method="average", metric="llr") ahc.fit(scores) - class_ids_ahc = ahc.get_flat_clusters(threshold, criterion='threshold') + class_ids_ahc = ahc.get_flat_clusters(threshold, criterion="threshold") - logging.info('saving clustering to %s' % (output_list)) + logging.info("saving clustering to %s" % (output_list)) if class_prefix is not None: - class_ids_ahc = np.asarray(['%s-%06d' % (class_prefix, i) for i in class_ids_ahc]) + class_ids_ahc = np.asarray( + ["%s-%06d" % (class_prefix, i) for i in class_ids_ahc] + ) u2c_out = Utt2Info.create(vcr.u2c.key, class_ids_ahc) - u2c_out.save(output_list, sep=' ') + u2c_out.save(output_list, sep=" ") + - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Does AHC from PLDA + SNorm + Calibration scores') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Does AHC from PLDA + SNorm + Calibration scores", + ) - parser.add_argument('--v-file', required=True) - parser.add_argument('--input-list', required=True) - parser.add_argument('--output-list', required=True) - parser.add_argument('--preproc-file', default=None) + parser.add_argument("--v-file", required=True) + parser.add_argument("--input-list", required=True) + parser.add_argument("--output-list", required=True) + parser.add_argument("--preproc-file", default=None) VCR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--pool-method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg']) - - parser.add_argument('--cal-file', default=None) - parser.add_argument('--score-hist-file', default=None) - parser.add_argument('--coh-nbest', type=int, default=100) - parser.add_argument('--class-prefix', default=None) - parser.add_argument('--threshold', type=float, default=0) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + type=str.lower, + default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + ) + + parser.add_argument("--cal-file", default=None) + parser.add_argument("--score-hist-file", default=None) + parser.add_argument("--coh-nbest", type=int, default=100) + parser.add_argument("--class-prefix", default=None) + 
parser.add_argument("--threshold", type=float, default=0) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) apply_ahc(**vars(args)) - - diff --git a/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py b/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py index 0eaabd1f..fb5dd6f9 100755 --- a/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-calibration-v1.py @@ -24,45 +24,45 @@ def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - - logging.info('load scores: %s' % in_score_file) + + logging.info("load scores: %s" % in_score_file) scr = TrialScores.load_txt(in_score_file) scr = scr.align_with_ndx(ndx) - logging.info('load model: %s' % model_file) + logging.info("load model: %s" % model_file) lr = LR.load(model_file) - logging.info('apply calibration') + logging.info("apply calibration") s_cal = lr.predict(scr.scores.ravel()) scr.scores = np.reshape(s_cal, scr.scores.shape) - logging.info('save scores: %s' % out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals linear calibration') + fromfile_prefix_chars="@", + description="Evals linear calibration", + ) - parser.add_argument('--in-score-file', dest='in_score_file', required=True) - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_calibration(**vars(args)) - + eval_calibration(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py b/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py index eeb4b21f..e3d1db91 100755 --- a/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-calibration-v2.py @@ -22,58 +22,61 @@ from hyperion.classifiers import BinaryLogisticRegression as LR from hyperion.utils import Utt2Info + def eval_calibration(in_score_file, ndx_file, model_file, cond_file, out_score_file): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - - logging.info('load scores: %s' % in_score_file) + + logging.info("load scores: %s" % in_score_file) scr = TrialScores.load_txt(in_score_file) scr = scr.align_with_ndx(ndx) enr2cond = Utt2Info.load(cond_file) - conds, cond_ids = 
np.unique(enr2cond.filter(ndx.model_set).info, return_inverse=True) + conds, cond_ids = np.unique( + enr2cond.filter(ndx.model_set).info, return_inverse=True + ) num_conds = len(conds) - - logging.info('apply calibration') + + logging.info("apply calibration") scores_cal = np.zeros_like(scr.scores) for cond in range(num_conds): idx = cond_ids == cond - model_file_cond = '{}-{}.h5'.format(model_file, cond) - logging.info('load model: %s' % model_file_cond) + model_file_cond = "{}-{}.h5".format(model_file, cond) + logging.info("load model: %s" % model_file_cond) lr = LR.load(model_file_cond) s_cal = lr.predict(scr.scores.ravel()) scores_cal_cond = np.reshape(s_cal, scr.scores.shape) scores_cal[idx] = scores_cal_cond[idx] scr.scores = scores_cal - logging.info('save scores: %s' % out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals linear calibration') - - parser.add_argument('--in-score-file', dest='in_score_file', required=True) - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--cond-file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Evals linear calibration", + ) + + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--cond-file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_calibration(**vars(args)) - + eval_calibration(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py b/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py index fcd3b572..0d67a741 100755 --- a/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-fusion-v1.py @@ -22,52 +22,54 @@ def eval_fusion(in_score_files, ndx_file, model_file, out_score_file, fus_idx): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - num_systems=len(in_score_files) + num_systems = len(in_score_files) in_scores = [] for i in range(num_systems): - logging.info('load scores: %s' % in_score_files[i]) + logging.info("load scores: %s" % in_score_files[i]) scr = TrialScores.load_txt(in_score_files[i]) scr = scr.align_with_ndx(ndx) - in_scores.append(scr.scores.ravel()[:,None]) + in_scores.append(scr.scores.ravel()[:, None]) in_scores = np.concatenate(tuple(in_scores), axis=1) - logging.info('load model: %s' % model_file) + logging.info("load model: %s" % model_file) gf = GF.load(model_file) - logging.info('apply fusion') + logging.info("apply fusion") s_fus = gf.predict(in_scores, fus_idx=fus_idx) scr.scores = np.reshape(s_fus, scr.scores.shape) - logging.info('save scores: %s' % 
out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals linear fusion from greedy fusion trainer') - - parser.add_argument('--in-score-files', dest='in_score_files', required=True, nargs='+') - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--fus-idx', dest='fus_idx', required=True, type=int) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Evals linear fusion from greedy fusion trainer", + ) + + parser.add_argument( + "--in-score-files", dest="in_score_files", required=True, nargs="+" + ) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--fus-idx", dest="fus_idx", required=True, type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_fusion(**vars(args)) - + eval_fusion(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py index 24640d8c..651a1b7f 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-snorm-v1.py @@ -24,13 +24,23 @@ from hyperion.score_norm import AdaptSNorm as SNorm from hyperion.helpers import VectorReader as VR -def eval_plda_e(x_e, x_t, back_end_dir, enroll_id, - x_coh, preproc_basename, plda_basename, - plda_type, pool_method, coh_nbest): + +def eval_plda_e( + x_e, + x_t, + back_end_dir, + enroll_id, + x_coh, + preproc_basename, + plda_basename, + plda_type, + pool_method, + coh_nbest, +): if preproc_basename is not None: preproc_file = Path(back_end_dir, enroll_id, preproc_basename) - logging.info('loading preproc transform: %s' % (preproc_file)) + logging.info("loading preproc transform: %s" % (preproc_file)) preproc = TransformList.load(preproc_file) x_e = preproc.predict(x_e) x_t = preproc.predict(x_t) @@ -38,93 +48,122 @@ def eval_plda_e(x_e, x_t, back_end_dir, enroll_id, ids_e = np.zeros((x_e.shape[0],), dtype=np.int) model_file = Path(back_end_dir, enroll_id, plda_basename) - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_Nvs1(x_e, x_t, method=pool_method, ids1=ids_e) - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_Nvs1(x_e, x_coh, method=pool_method, ids1=ids_e) - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) return scores -def 
eval_plda(iv_file, ndx_file, enroll_file, test_file, - coh_iv_file, coh_list, coh_nbest, - back_end_dir, preproc_basename, plda_basename, - score_file, plda_type, pool_method, **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + coh_iv_file, + coh_list, + coh_nbest, + back_end_dir, + preproc_basename, + plda_basename, + score_file, + plda_type, + pool_method, + **kwargs +): + + logging.info("loading data") tdr = TDR(iv_file, ndx_file, enroll_file, test_file, None) x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) - logging.info('loading cohort data') + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, None) x_coh = vr.read() t1 = time.time() scores = np.zeros((len(enroll), x_t.shape[0]), dtype=float_cpu()) for i in range(len(enroll)): - enroll_i=enroll[i] + enroll_i = enroll[i] mask_i = ids_e == i x_e_i = x_e[mask_i] - scores_i = eval_plda_e(x_e_i, x_t, back_end_dir, enroll_i, x_coh, - preproc_basename, plda_basename, - plda_type, pool_method, coh_nbest) + scores_i = eval_plda_e( + x_e_i, + x_t, + back_end_dir, + enroll_i, + x_coh, + preproc_basename, + plda_basename, + plda_type, + pool_method, + coh_nbest, + ) scores[i] = scores_i - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA with model adapted to the enrollment cuts') - - parser.add_argument('--iv-file', required=True) - parser.add_argument('--ndx-file', default=None) - parser.add_argument('--enroll-file', required=True) - parser.add_argument('--test-file', default=None) - parser.add_argument('--coh-iv-file', required=True) - parser.add_argument('--coh-list', required=True) - parser.add_argument('--coh-nbest', type=int, default=100) - parser.add_argument('--preproc-basename', default=None) - parser.add_argument('--plda-basename', default='plda.h5') - parser.add_argument('--back-end-dir', required=True) - parser.add_argument('--plda-type', default='splda', - choices=['frplda', 'splda', 'plda'], - help=('PLDA type')) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA with model adapted to the enrollment cuts", + ) + + parser.add_argument("--iv-file", required=True) + parser.add_argument("--ndx-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-file", default=None) + parser.add_argument("--coh-iv-file", required=True) + parser.add_argument("--coh-list", required=True) + parser.add_argument("--coh-nbest", type=int, default=100) + parser.add_argument("--preproc-basename", default=None) + parser.add_argument("--plda-basename", default="plda.h5") + parser.add_argument("--back-end-dir", required=True) + parser.add_argument( + "--plda-type", + default="splda", + choices=["frplda", "splda", "plda"], + help=("PLDA type"), + ) TDR.add_argparse_args(parser) - parser.add_argument('--pool-method', 
dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg']) - - parser.add_argument('--score-file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + ) + + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py index b4f9c64c..49ad3b42 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-knn-v1.py @@ -23,32 +23,49 @@ from hyperion.transforms import TransformList -def eval_plda_e(x_e, x_t, back_end_dir, enroll_id, - preproc_basename, plda_basename, - plda_type, pool_method): +def eval_plda_e( + x_e, + x_t, + back_end_dir, + enroll_id, + preproc_basename, + plda_basename, + plda_type, + pool_method, +): if preproc_basename is not None: preproc_file = Path(back_end_dir, enroll_id, preproc_basename) - logging.info('loading preproc transform: %s' % (preproc_file)) + logging.info("loading preproc transform: %s" % (preproc_file)) preproc = TransformList.load(preproc_file) x_e = preproc.predict(x_e) x_t = preproc.predict(x_t) ids_e = np.zeros((x_e.shape[0],), dtype=np.int) model_file = Path(back_end_dir, enroll_id, plda_basename) - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_Nvs1(x_e, x_t, method=pool_method, ids1=ids_e) return scores -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - back_end_dir, preproc_basename, plda_basename, - score_file, plda_type, pool_method, **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + back_end_dir, + preproc_basename, + plda_basename, + score_file, + plda_type, + pool_method, + **kwargs +): + + logging.info("loading data") tdr = TDR(iv_file, ndx_file, enroll_file, test_file, None) x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) @@ -56,57 +73,72 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, t1 = time.time() scores = np.zeros((len(enroll), x_t.shape[0]), dtype=float_cpu()) for i in range(len(enroll)): - enroll_i=enroll[i] + enroll_i = enroll[i] mask_i = ids_e == i x_e_i = x_e[mask_i] - scores_i = eval_plda_e(x_e_i, x_t, back_end_dir, enroll_i, - preproc_basename, plda_basename, - plda_type, pool_method) + scores_i = eval_plda_e( + x_e_i, + x_t, + back_end_dir, + enroll_i, + preproc_basename, + plda_basename, + plda_type, + pool_method, + ) scores[i] = scores_i - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA with model adapted to the enrollment cuts') - - parser.add_argument('--iv-file', required=True) - parser.add_argument('--ndx-file', default=None) - parser.add_argument('--enroll-file', required=True) - parser.add_argument('--test-file', default=None) - parser.add_argument('--preproc-basename', default=None) - parser.add_argument('--plda-basename', default='plda.h5') - parser.add_argument('--back-end-dir', required=True) - parser.add_argument('--plda-type', default='splda', - choices=['frplda', 'splda', 'plda'], - help=('PLDA type')) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA with model adapted to the enrollment cuts", + ) + + parser.add_argument("--iv-file", required=True) + parser.add_argument("--ndx-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-file", default=None) + parser.add_argument("--preproc-basename", default=None) + parser.add_argument("--plda-basename", default="plda.h5") + parser.add_argument("--back-end-dir", required=True) + parser.add_argument( + "--plda-type", + default="splda", + choices=["frplda", "splda", "plda"], + help=("PLDA type"), + ) TDR.add_argparse_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg']) - - parser.add_argument('--score-file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + ) + + parser.add_argument("--score-file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py index b4304981..ac6710ad 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v1.py @@ -24,14 +24,22 @@ from hyperion.helpers import VectorReader as VR -def eval_plda(iv_file, ndx_file, enroll_file, - preproc_file, - coh_iv_file, coh_list, coh_nbest, - model_file, score_file, plda_type, - pool_method, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + model_file, + score_file, + plda_type, + pool_method, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -41,79 +49,86 @@ def eval_plda(iv_file, ndx_file, enroll_file, x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda 
model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_Nvs1(x_e, x_t, method=pool_method, ids1=ids_e) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('loading cohort data') + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_Nvs1(x_e, x_coh, method=pool_method, ids1=ids_e) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 telephone condition with S-Norm') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 telephone condition with S-Norm", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg'], - help=('(default: %(default)s)')) - - parser.add_argument('--score-file', 
dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + help=("(default: %(default)s)"), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py index 775b7e9f..7430caf4 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-snorm-v2.py @@ -23,11 +23,20 @@ from hyperion.helpers import VectorReader as VR -def eval_plda(iv_file, ndx_file, enroll_file, preproc_file, - coh_iv_file, coh_list, coh_nbest, - score_file, pool_method, **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + score_file, + pool_method, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -38,74 +47,83 @@ def eval_plda(iv_file, ndx_file, enroll_file, preproc_file, enroll, ids_e = np.unique(enroll, return_inverse=True) t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") D_e = PLDA.compute_stats_hard(x_e, class_ids=ids_e) - x_e = D_e[1]/np.expand_dims(D_e[0], axis=-1) + x_e = D_e[1] / np.expand_dims(D_e[0], axis=-1) scores = cosine_scoring(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('loading cohort data') + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = cosine_scoring(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = cosine_scoring(x_e, x_coh) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval cosine scoring with S-Norm') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval cosine scoring with S-Norm", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', choices=['vavg','savg'], help=('pool method')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["vavg", "savg"], + help=("pool method"), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py index 10794df3..fb2904b1 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v1.py @@ -21,13 +21,20 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, - pool_method, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + pool_method, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -37,54 +44,59 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + 
logging.info("computing llr") scores = model.llr_Nvs1(x_e, x_t, method=pool_method, ids1=ids_e) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA for SR18 telephone condition') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA for SR18 telephone condition", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg'], - help=('(default: %(default)s)')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + help=("(default: %(default)s)"), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py index 11cd9b2c..9eaea8b5 100755 --- a/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py +++ b/egs/sre20-cts/v1/steps_be/eval-tel-be-v2.py @@ -22,10 +22,18 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, score_file, pool_method, **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + score_file, + pool_method, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -36,51 +44,56 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, enroll, ids_e = np.unique(enroll, return_inverse=True) t1 = time.time() - logging.info('computing llr') + 
logging.info("computing llr") D_e = PLDA.compute_stats_hard(x_e, class_ids=ids_e) - x_e=D_e[1]/np.expand_dims(D_e[0], axis=-1) + x_e = D_e[1] / np.expand_dims(D_e[0], axis=-1) scores = cosine_scoring(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval cosine scoring') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval cosine scoring", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['vavg','savg'], - help=('(default: %(default)s)')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["vavg", "savg"], + help=("(default: %(default)s)"), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/sre20-cts/v1/steps_be/train-calibration-v1.py b/egs/sre20-cts/v1/steps_be/train-calibration-v1.py index 23db5935..779e62af 100755 --- a/egs/sre20-cts/v1/steps_be/train-calibration-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-calibration-v1.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -27,64 +24,71 @@ def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): - logging.info('load key: %s' % key_file) + logging.info("load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('load scores: %s' % score_file) + logging.info("load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('min_dcf: %.3f p_miss: %.2f 
p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - logging.info('train calibration') + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + logging.info("train calibration") x = np.concatenate((tar, non)) - y = np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) - lr = LR(prior=prior, lambda_reg=lambda_reg, bias_scaling=1, solver='liblinear', verbose=verbose) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) lr.fit(x, y) print(lr.A) print(lr.b) - logging.info('save calibration at %s' % model_file) + logging.info("save calibration at %s" % model_file) lr.save(model_file) - logging.info('calibrate scores') + logging.info("calibrate scores") tar_cal = lr.predict(tar) non_cal = lr.predict(non) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - - - + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains llr calibration') - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('--lambda-reg', dest='lambda_reg', type=float, - default=1e-5) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Trains llr calibration", + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", dest="lambda_reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_calibration(**vars(args)) - + train_calibration(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/train-calibration-v2.py b/egs/sre20-cts/v1/steps_be/train-calibration-v2.py index 5819083c..16d09e3a 100755 --- a/egs/sre20-cts/v1/steps_be/train-calibration-v2.py +++ b/egs/sre20-cts/v1/steps_be/train-calibration-v2.py @@ -29,106 +29,125 @@ def train_calibration_cond(cond, scr, key, model_file, prior, lambda_reg, verbos ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('cond %s min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (cond, min_dcf, p_miss*100, p_fa*100, 
n_miss, n_fa)) - - logging.info('train calibration') + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "cond %s min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (cond, min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + logging.info("train calibration") x = np.concatenate((tar, non)) - y = np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) - lr = LR(prior=prior, lambda_reg=lambda_reg, bias_scaling=1, solver='liblinear', verbose=verbose) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) lr.fit(x, y) - model_file = '{}-{}.h5'.format(model_file, cond) - logging.info('save calibration at %s' % model_file) + model_file = "{}-{}.h5".format(model_file, cond) + logging.info("save calibration at %s" % model_file) lr.save(model_file) - logging.info('calibrate scores') + logging.info("calibrate scores") tar_cal = lr.predict(tar) non_cal = lr.predict(non) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('cond %s act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (cond, act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "cond %s act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (cond, act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) return tar_cal, non_cal -def train_calibration(score_file, key_file, model_file, cond_file, prior, lambda_reg, verbose): +def train_calibration( + score_file, key_file, model_file, cond_file, prior, lambda_reg, verbose +): - logging.info('load key: %s' % key_file) + logging.info("load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('load scores: %s' % score_file) + logging.info("load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) enr2cond = Utt2Info.load(cond_file) - conds, cond_ids = np.unique(enr2cond.filter(key.model_set).info, return_inverse=True) + conds, cond_ids = np.unique( + enr2cond.filter(key.model_set).info, return_inverse=True + ) num_conds = len(conds) ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) del tar, non - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('global result before calibration') - logging.info('min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info("global result before calibration") + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) tar_cal = [] non_cal = [] for cond in range(num_conds): - logging.info('train calibration cond %d' % (cond)) - model_set_cond = key.model_set[cond_ids==cond] + logging.info("train calibration cond %d" % (cond)) + model_set_cond = key.model_set[cond_ids == cond] key_cond = key.filter(model_set_cond, key.seg_set) tar_cal_cond, non_cal_cond = train_calibration_cond( - conds[cond], scr, key_cond, model_file, prior, lambda_reg, verbose) + conds[cond], scr, key_cond, model_file, prior, lambda_reg, verbose + ) tar_cal.append(tar_cal_cond) non_cal.append(non_cal_cond) tar_cal = np.concatenate(tuple(tar_cal), axis=-1) non_cal = np.concatenate(tuple(non_cal), axis=-1) - logging.info('global result after 
calibration') + logging.info("global result after calibration") min_dcf, p_miss, p_fa = compute_min_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains llr calibration with multiple enrollment conditions') - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--cond-file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('--lambda-reg', dest='lambda_reg', type=float, - default=1e-5) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Trains llr calibration with multiple enrollment conditions", + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--cond-file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", dest="lambda_reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_calibration(**vars(args)) - + train_calibration(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/train-fusion-v1.py b/egs/sre20-cts/v1/steps_be/train-fusion-v1.py index 46b30741..a76b2b6c 100755 --- a/egs/sre20-cts/v1/steps_be/train-fusion-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-fusion-v1.py @@ -21,41 +21,62 @@ from hyperion.classifiers import GreedyFusionBinaryLR as GF -def train_fusion(score_files, system_names, key_file, model_file, prior, prior_eval, - lambda_reg, solver, max_systems, verbose): - num_systems=len(score_files) - assert num_systems == len(system_names), 'len(score_files)(%d) != len(system_names)(%d)' % ( - num_systems, len(system_names)) - - logging.info('load key: %s' % key_file) +def train_fusion( + score_files, + system_names, + key_file, + model_file, + prior, + prior_eval, + lambda_reg, + solver, + max_systems, + verbose, +): + num_systems = len(score_files) + assert num_systems == len( + system_names + ), "len(score_files)(%d) != len(system_names)(%d)" % ( + num_systems, + len(system_names), + ) + + logging.info("load key: %s" % key_file) key = 
TrialKey.load_txt(key_file) tar = [] non = [] for i in range(num_systems): - logging.info('load scores: %s' % score_files[i]) + logging.info("load scores: %s" % score_files[i]) scr = TrialScores.load_txt(score_files[i]) tar_i, non_i = scr.get_tar_non(key) - tar.append(tar_i[:,None]) - non.append(non_i[:,None]) - + tar.append(tar_i[:, None]) + non.append(non_i[:, None]) + tar = np.concatenate(tuple(tar), axis=1) non = np.concatenate(tuple(non), axis=1) ntar = tar.shape[0] nnon = non.shape[0] - logging.info('train fusion') + logging.info("train fusion") x = np.concatenate((tar, non), axis=0) - y = np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) - gf = GF(system_names=system_names, prior=prior, prior_eval=prior_eval, - lambda_reg=lambda_reg, solver=solver, max_systems=max_systems, - verbose=verbose) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + gf = GF( + system_names=system_names, + prior=prior, + prior_eval=prior_eval, + lambda_reg=lambda_reg, + solver=solver, + max_systems=max_systems, + verbose=verbose, + ) gf.fit(x, y) - logging.info('save calibration at %s' % model_file) + logging.info("save calibration at %s" % model_file) gf.save(model_file) - logging.info('fuse scores') + logging.info("fuse scores") tar_fus = gf.predict(tar) non_fus = gf.predict(non) for i in range(len(tar_fus)): @@ -67,49 +88,54 @@ def train_fusion(score_files, system_names, key_file, model_file, prior, prior_e p_miss = p_miss[None] p_fa = p_fa[None] - info_str='' + info_str = "" for j in range(len(gf.prior_eval)): - n_miss = p_miss[j]*ntar - n_fa = p_fa[j]*nnon - info_str='%s (p=%.3f) min_dcf: %.3f act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % ( - info_str, gf.prior_eval[j], min_dcf[j], act_dcf[j], p_miss[j]*100, p_fa[j]*100, n_miss, n_fa) - - logging.info('Best-%d %s' % (i+1, info_str)) - + n_miss = p_miss[j] * ntar + n_fa = p_fa[j] * nnon + info_str = "%s (p=%.3f) min_dcf: %.3f act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" % ( + info_str, + gf.prior_eval[j], + min_dcf[j], + act_dcf[j], + p_miss[j] * 100, + p_fa[j] * 100, + n_miss, + n_fa, + ) + + logging.info("Best-%d %s" % (i + 1, info_str)) - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains greedy binary logistic regression fusion') - - parser.add_argument('--score-files', dest='score_files', nargs='+', required=True) - parser.add_argument('--system-names', dest='system_names', nargs='+', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('--prior-eval', dest='prior_eval', type=float, nargs='+', - default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - parser.add_argument('--lambda-reg', dest='lambda_reg', type=float, - default=1e-5) - parser.add_argument('--solver', dest='solver', - choices=['liblinear', 'newton-cg', 'lbfgs', 'sag', 'saga'], - default='liblinear') - parser.add_argument('--max-systems', dest='max_systems', type=int, - default=10) - - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Trains greedy binary logistic regression fusion", + ) + + parser.add_argument("--score-files", 
dest="score_files", nargs="+", required=True) + parser.add_argument("--system-names", dest="system_names", nargs="+", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument( + "--prior-eval", dest="prior_eval", type=float, nargs="+", default=None + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--lambda-reg", dest="lambda_reg", type=float, default=1e-5) + parser.add_argument( + "--solver", + dest="solver", + choices=["liblinear", "newton-cg", "lbfgs", "sag", "saga"], + default="liblinear", + ) + parser.add_argument("--max-systems", dest="max_systems", type=int, default=10) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_fusion(**vars(args)) - + train_fusion(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py index ea2cfadb..a024281a 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v1.py @@ -21,16 +21,27 @@ from numpy.linalg import matrix_rank - -def train_be(x_et, x_trn, class_ids_trn, u2c_trn, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, k_nn, - output_path): + +def train_be( + x_et, + x_trn, + class_ids_trn, + u2c_trn, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + k_nn, + output_path, +): t1 = time.time() x_et_avg = np.mean(x_et, axis=0, keepdims=True) D_trn = PLDA.compute_stats_hard(x_trn, class_ids=class_ids_trn) - x_trn_avg = D_trn[1]/np.expand_dims(D_trn[0], axis=-1) + x_trn_avg = D_trn[1] / np.expand_dims(D_trn[0], axis=-1) scores = cosine_scoring(x_et_avg, x_trn_avg) cohort_class_idx = np.argsort(-scores[0])[:k_nn] cohort_seg_mask = np.zeros((x_trn.shape[0],), dtype=np.bool) @@ -43,47 +54,45 @@ def train_be(x_et, x_trn, class_ids_trn, u2c_trn, u2c_trn = Utt2Info(u2c_trn.utt_info[cohort_seg_mask]) x = np.concatenate((x_trn, x_et), axis=0) - class_ids_et = (np.max(class_ids_trn)+1)*np.ones((x_et.shape[0],), dtype=np.int) + class_ids_et = (np.max(class_ids_trn) + 1) * np.ones((x_et.shape[0],), dtype=np.int) class_ids = np.concatenate((class_ids_trn, class_ids_et), axis=0) - _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) - + _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) + t1 = time.time() rank = matrix_rank(x) pca = None - logging.info('x rank=%d' % (rank)) + logging.info("x rank=%d" % (rank)) if rank < x.shape[1]: # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x) x = pca.predict(x) if lda_dim > rank: lda_dim = rank if y_dim > rank: y_dim = rank - logging.info('PCA rank=%d' % (rank)) + logging.info("PCA rank=%d" % (rank)) # Train LDA - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('LNorm Elapsed time: %.2f s.' 
% (time.time()-t1)) - + logging.info("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, epochs=epochs, - ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + logging.info("PLDA Elapsed time: %.2f s." % (time.time() - t1)) # Save models if pca is None: @@ -94,21 +103,34 @@ def train_be(x_et, x_trn, class_ids_trn, u2c_trn, if not os.path.exists(output_path): os.makedirs(output_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - - u2c_trn.save(output_path + '/knn') - - -def train_bes(v_file_train, train_list, - v_file_enroll_test, enroll_test_list, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, k_nn, - output_path, part_idx, num_parts, **kwargs): + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + + u2c_trn.save(output_path + "/knn") + + +def train_bes( + v_file_train, + train_list, + v_file_enroll_test, + enroll_test_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + k_nn, + output_path, + part_idx, + num_parts, + **kwargs +): # Read train data vcr_args = VCR.filter_args(**kwargs) @@ -116,52 +138,63 @@ def train_bes(v_file_train, train_list, x_trn, class_ids_trn = vcr.read() u2c_trn = vcr.u2c del vcr - + reader = DRF.create(v_file_enroll_test) u2c = Utt2Info.load(enroll_test_list) u2c = u2c.split(part_idx, num_parts, group_by_field=1) class_names_et, class_ids_et = np.unique(u2c.info, return_inverse=True) num_classes_et = np.max(class_ids_et) + 1 for c in range(num_classes_et): - logging.info('Training PLDA for %s' % (class_names_et[c])) + logging.info("Training PLDA for %s" % (class_names_et[c])) sel_idx = class_ids_et == c key_c = u2c.key[sel_idx] x_et = reader.read(key_c, squeeze=True) - output_path_c = output_path + '/' + class_names_et[c] - train_be(x_et, x_trn, class_ids_trn, u2c_trn, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, k_nn, - output_path_c) + output_path_c = output_path + "/" + class_names_et[c] + train_be( + x_et, + x_trn, + class_ids_trn, + u2c_trn, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + k_nn, + output_path_c, + ) + - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train a Back-end for each trial side using kNN') - - parser.add_argument('--v-file-train', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--v-file-enroll-test', required=True) - parser.add_argument('--enroll-test-list', required=True) - + fromfile_prefix_chars="@", + description="Train a Back-end for each trial side using kNN", + ) + + parser.add_argument("--v-file-train", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--v-file-enroll-test", required=True) + parser.add_argument("--enroll-test-list", required=True) + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - 
parser.add_argument('--output-path', required=True) - parser.add_argument('--lda-dim', type=int, default=150) - parser.add_argument('--k-nn', type=int, default=500) - parser.add_argument('--part-idx', type=int, default=1) - parser.add_argument('--num-parts', type=int, default=1) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", required=True) + parser.add_argument("--lda-dim", type=int, default=150) + parser.add_argument("--k-nn", type=int, default=500) + parser.add_argument("--part-idx", type=int, default=1) + parser.add_argument("--num-parts", type=int, default=1) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_bes(**vars(args)) - + train_bes(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py index 2e4eba80..568e7edf 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v3.py @@ -21,17 +21,33 @@ from numpy.linalg import matrix_rank, svd - -def train_be(x_et, x_trn, class_ids_trn, u2c_trn, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, k_nn_1, k_nn_2, pca_var_r, - w_mu, w_B, w_W, output_path): + +def train_be( + x_et, + x_trn, + class_ids_trn, + u2c_trn, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + k_nn_1, + k_nn_2, + pca_var_r, + w_mu, + w_B, + w_W, + output_path, +): t1 = time.time() # Select training cohort (closest speakers) x_et_avg = np.mean(x_et, axis=0, keepdims=True) D_trn = PLDA.compute_stats_hard(x_trn, class_ids=class_ids_trn) - x_trn_avg = D_trn[1]/np.expand_dims(D_trn[0], axis=-1) + x_trn_avg = D_trn[1] / np.expand_dims(D_trn[0], axis=-1) scores = cosine_scoring(x_et_avg, x_trn_avg) cohort_class_idx = np.argsort(-scores[0])[:k_nn_1] cohort_seg_mask = np.zeros((x_trn.shape[0],), dtype=np.bool) @@ -45,67 +61,66 @@ def train_be(x_et, x_trn, class_ids_trn, u2c_trn, u2c_trn = Utt2Info(u2c_trn.utt_info[cohort_seg_mask]) x = np.concatenate((x_trn, x_et), axis=0) - class_ids_et = (np.max(class_ids_trn)+1)*np.ones((x_et.shape[0],), dtype=np.int) + class_ids_et = (np.max(class_ids_trn) + 1) * np.ones((x_et.shape[0],), dtype=np.int) class_ids = np.concatenate((class_ids_trn, class_ids_et), axis=0) - _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) + _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) - logging.info('Select num_spks={} num_segments={}'.format(np.max(class_ids)+1, x.shape[0])) + logging.info( + "Select num_spks={} num_segments={}".format(np.max(class_ids) + 1, x.shape[0]) + ) # Training prior PLDA model t1 = time.time() if pca_var_r == 1: rank = matrix_rank(x) else: sv = svd(x, compute_uv=False) - Ecc = np.cumsum(sv**2) - Ecc = Ecc/Ecc[-1] - #logging.info('sv={} Ecc={}'.format(sv, Ecc)) + Ecc = np.cumsum(sv ** 2) + Ecc = Ecc / Ecc[-1] + # logging.info('sv={} Ecc={}'.format(sv, Ecc)) rank = np.where(Ecc > pca_var_r)[0][0] pca = None - logging.info('x rank=%d' % (rank)) + logging.info("x rank=%d" % (rank)) if rank < x.shape[1]: # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x) x = pca.predict(x) if lda_dim > rank: lda_dim = rank if y_dim > rank: y_dim = rank - 
logging.info('PCA rank=%d' % (rank)) + logging.info("PCA rank=%d" % (rank)) # Train LDA - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, epochs=epochs, - ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + logging.info("PLDA Elapsed time: %.2f s." % (time.time() - t1)) if not os.path.exists(output_path): os.makedirs(output_path) - u2c_trn.save(output_path + '/knn_1') + u2c_trn.save(output_path + "/knn_1") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_1.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_1.csv", elbo, delimiter=",") - # Select adaptaton cohort x_trn_ln = x_ln[:n_trn] x_et_ln = x_ln[n_trn:] @@ -125,11 +140,15 @@ def train_be(x_et, x_trn, class_ids_trn, u2c_trn, u2c_trn = Utt2Info(u2c_trn.utt_info[cohort_seg_mask]) x = np.concatenate((x_trn, x_et), axis=0) - class_ids_et = (np.max(class_ids_trn)+1)*np.ones((x_et.shape[0],), dtype=np.int) + class_ids_et = (np.max(class_ids_trn) + 1) * np.ones((x_et.shape[0],), dtype=np.int) class_ids = np.concatenate((class_ids_trn, class_ids_et), axis=0) - _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) - - logging.info('Select num_spks={} num_segments={} {}'.format(np.max(class_ids)+1, x.shape[0], k_nn_2)) + _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) + + logging.info( + "Select num_spks={} num_segments={} {}".format( + np.max(class_ids) + 1, x.shape[0], k_nn_2 + ) + ) # Adapt PLDA if pca: x = pca.predict(x) @@ -142,31 +161,48 @@ def train_be(x_et, x_trn, class_ids_trn, u2c_trn, else: preproc = TransformList([pca, lda, lnorm]) - preproc.save(output_path + '/lda_lnorm.h5') - + preproc.save(output_path + "/lda_lnorm.h5") + x_ln = lnorm.predict(x_lda) plda_adapt1 = plda.copy() - if np.max(class_ids)+1 < plda.y_dim: + if np.max(class_ids) + 1 < plda.y_dim: plda.update_V = False elbo = plda.fit(x_ln, class_ids, epochs=epochs) plda_adapt1.weighted_avg_model(plda, w_mu, w_B, w_W) - plda_adapt1.save(output_path + '/plda.h5') + plda_adapt1.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_2.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_2.csv", elbo, delimiter=",") - u2c_trn.save(output_path + '/knn_2') + u2c_trn.save(output_path + "/knn_2") - -def train_bes(v_file_train, train_list, - v_file_enroll_test, enroll_test_list, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, k_nn_1, k_nn_2, - w_mu, w_B, w_W, pca_var_r, - output_path, part_idx, num_parts, **kwargs): + +def train_bes( + v_file_train, + train_list, + v_file_enroll_test, + enroll_test_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + 
ml_md, + md_epochs, + k_nn_1, + k_nn_2, + w_mu, + w_B, + w_W, + pca_var_r, + output_path, + part_idx, + num_parts, + **kwargs +): # Read train data vcr_args = VCR.filter_args(**kwargs) @@ -174,57 +210,73 @@ def train_bes(v_file_train, train_list, x_trn, class_ids_trn = vcr.read() u2c_trn = vcr.u2c del vcr - + reader = DRF.create(v_file_enroll_test) u2c = Utt2Info.load(enroll_test_list) u2c = u2c.split(part_idx, num_parts, group_by_field=1) class_names_et, class_ids_et = np.unique(u2c.info, return_inverse=True) num_classes_et = np.max(class_ids_et) + 1 for c in range(num_classes_et): - logging.info('Training PLDA for %s' % (class_names_et[c])) + logging.info("Training PLDA for %s" % (class_names_et[c])) sel_idx = class_ids_et == c key_c = u2c.key[sel_idx] x_et = reader.read(key_c, squeeze=True) - output_path_c = output_path + '/' + class_names_et[c] - train_be(x_et, x_trn, class_ids_trn, u2c_trn, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, k_nn_1, k_nn_2, pca_var_r, - w_mu, w_B, w_W, output_path_c) + output_path_c = output_path + "/" + class_names_et[c] + train_be( + x_et, + x_trn, + class_ids_trn, + u2c_trn, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + k_nn_1, + k_nn_2, + pca_var_r, + w_mu, + w_B, + w_W, + output_path_c, + ) + - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train a Back-end for each trial side using kNN') - - parser.add_argument('--v-file-train', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--v-file-enroll-test', required=True) - parser.add_argument('--enroll-test-list', required=True) - + fromfile_prefix_chars="@", + description="Train a Back-end for each trial side using kNN", + ) + + parser.add_argument("--v-file-train", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--v-file-enroll-test", required=True) + parser.add_argument("--enroll-test-list", required=True) + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', required=True) - parser.add_argument('--lda-dim', type=int, default=150) - parser.add_argument('--k-nn-1', type=int, default=500) - parser.add_argument('--k-nn-2', type=int, default=600) - parser.add_argument('--pca-var-r', type=float, default=1) - parser.add_argument('--w-mu', dest='w_mu', type=float, default=1) - parser.add_argument('--w-b', dest='w_B', type=float, default=0.5) - parser.add_argument('--w-w', dest='w_W', type=float, default=0.5) - parser.add_argument('--part-idx', type=int, default=1) - parser.add_argument('--num-parts', type=int, default=1) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", required=True) + parser.add_argument("--lda-dim", type=int, default=150) + parser.add_argument("--k-nn-1", type=int, default=500) + parser.add_argument("--k-nn-2", type=int, default=600) + parser.add_argument("--pca-var-r", type=float, default=1) + parser.add_argument("--w-mu", dest="w_mu", type=float, default=1) + parser.add_argument("--w-b", dest="w_B", type=float, default=0.5) + parser.add_argument("--w-w", dest="w_W", type=float, default=0.5) + parser.add_argument("--part-idx", type=int, default=1) + parser.add_argument("--num-parts", type=int, default=1) + parser.add_argument( + "-v", 
"--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_bes(**vars(args)) - + train_bes(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py index 67eb54cd..7633cf17 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-knn-v4.py @@ -32,61 +32,107 @@ from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF from hyperion.torch.metrics import CategoricalAccuracy + def train_dplda(x, class_ids, plda_init, output_path): - batch_size = min(int(2**13), len(x)) - batch_size = min(int(2**12), len(x)) + batch_size = min(int(2 ** 13), len(x)) + batch_size = min(int(2 ** 12), len(x)) batch_size = 512 dataset = EmbedDataset(x, class_ids) - sampler = ClassWeightedEmbedSampler(dataset, batch_size=batch_size, num_egs_per_class=4) - train_loader = torch.utils.data.DataLoader( - dataset, batch_sampler = sampler) + sampler = ClassWeightedEmbedSampler( + dataset, batch_size=batch_size, num_egs_per_class=4 + ) + train_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler) - sampler_val = ClassWeightedEmbedSampler(dataset, batch_size=batch_size, num_egs_per_class=4) - val_loader = torch.utils.data.DataLoader( - dataset, batch_sampler = sampler_val) + sampler_val = ClassWeightedEmbedSampler( + dataset, batch_size=batch_size, num_egs_per_class=4 + ) + val_loader = torch.utils.data.DataLoader(dataset, batch_sampler=sampler_val) N, F, _ = SPLDA.compute_stats_hard(x, class_ids) - x_ref = F/N[:,None] - # model = TSPLDA(mu=plda_init.mu, V=plda_init.V, W=plda_init.W, - # num_classes=np.max(class_ids)+1, x_ref=x_ref, lnorm=True, - # margin_multi=0.3, margin_tar=0.5, margin_non=0.5, + x_ref = F / N[:, None] + # model = TSPLDA(mu=plda_init.mu, V=plda_init.V, W=plda_init.W, + # num_classes=np.max(class_ids)+1, x_ref=x_ref, lnorm=True, + # margin_multi=0.3, margin_tar=0.5, margin_non=0.5, # margin_warmup_epochs=5, adapt_margin=True) - model = TSPLDA(mu=plda_init.mu, V=plda_init.V, W=plda_init.W, - num_classes=np.max(class_ids)+1, x_ref=x_ref, lnorm=True, - margin_multi=0, margin_tar=30, margin_non=30, adapt_margin=False, margin_warmup_epochs=25) #it was 50 + model = TSPLDA( + mu=plda_init.mu, + V=plda_init.V, + W=plda_init.W, + num_classes=np.max(class_ids) + 1, + x_ref=x_ref, + lnorm=True, + margin_multi=0, + margin_tar=30, + margin_non=30, + adapt_margin=False, + margin_warmup_epochs=25, + ) # it was 50 + + optimizer = OF.create( + model.parameters(), opt_type="sgd", lr=0.1, momentum=0.5, nesterov=True + ) + lr_sch = LRSF.create( + optimizer, + lrsch_type="exp_lr", + decay_rate=0.5, + decay_steps=1000, + hold_steps=1000, + min_lr=1e-5, + warmup_steps=10, + update_lr_on_opt_step=True, + ) + metrics = {"acc": CategoricalAccuracy()} - optimizer = OF.create(model.parameters(), opt_type='sgd', lr=0.1, momentum=0.5, nesterov=True) - lr_sch = LRSF.create(optimizer, lrsch_type='exp_lr', - decay_rate=0.5, decay_steps=1000, hold_steps=1000, - min_lr=1e-5, warmup_steps=10, update_lr_on_opt_step=True) - metrics = { 'acc': CategoricalAccuracy() } - trainer = PLDATrainer( - model, optimizer, epochs=25, - device='cpu', metrics=metrics, lr_scheduler=lr_sch, - loss_weights={'multi': 0, 'bin': 1}, p_tar=0.05, - exp_path=output_path + '/dplda') - + model, + optimizer, + epochs=25, + device="cpu", + metrics=metrics, + lr_scheduler=lr_sch, + loss_weights={"multi": 0, 
"bin": 1}, + p_tar=0.05, + exp_path=output_path + "/dplda", + ) + trainer.fit(train_loader, val_loader) - plda = SPLDA(mu=model.mu.detach().cpu().numpy(), - V=model.V.detach().cpu().numpy(), - W=model.W.detach().cpu().numpy()) + plda = SPLDA( + mu=model.mu.detach().cpu().numpy(), + V=model.V.detach().cpu().numpy(), + W=model.W.detach().cpu().numpy(), + ) return plda - -def train_be(x_et, x_trn, class_ids_trn, u2c_trn, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, k_nn_1, k_nn_2, pca_var_r, - w_mu, w_B, w_W, output_path): + +def train_be( + x_et, + x_trn, + class_ids_trn, + u2c_trn, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + k_nn_1, + k_nn_2, + pca_var_r, + w_mu, + w_B, + w_W, + output_path, +): t1 = time.time() # Select training cohort (closest speakers) x_et_avg = np.mean(x_et, axis=0, keepdims=True) D_trn = PLDA.compute_stats_hard(x_trn, class_ids=class_ids_trn) - x_trn_avg = D_trn[1]/np.expand_dims(D_trn[0], axis=-1) + x_trn_avg = D_trn[1] / np.expand_dims(D_trn[0], axis=-1) scores = cosine_scoring(x_et_avg, x_trn_avg) cohort_class_idx = np.argsort(-scores[0])[:k_nn_1] cohort_seg_mask = np.zeros((x_trn.shape[0],), dtype=np.bool) @@ -100,68 +146,67 @@ def train_be(x_et, x_trn, class_ids_trn, u2c_trn, u2c_trn = Utt2Info(u2c_trn.utt_info[cohort_seg_mask]) x = np.concatenate((x_trn, x_et), axis=0) - class_ids_et = (np.max(class_ids_trn)+1)*np.ones((x_et.shape[0],), dtype=np.int) + class_ids_et = (np.max(class_ids_trn) + 1) * np.ones((x_et.shape[0],), dtype=np.int) class_ids = np.concatenate((class_ids_trn, class_ids_et), axis=0) - _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) + _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) - logging.info('Select num_spks={} num_segments={}'.format(np.max(class_ids)+1, x.shape[0])) + logging.info( + "Select num_spks={} num_segments={}".format(np.max(class_ids) + 1, x.shape[0]) + ) # Training prior PLDA model t1 = time.time() if pca_var_r == 1: rank = matrix_rank(x) else: sv = svd(x, compute_uv=False) - Ecc = np.cumsum(sv**2) - Ecc = Ecc/Ecc[-1] - #logging.info('sv={} Ecc={}'.format(sv, Ecc)) + Ecc = np.cumsum(sv ** 2) + Ecc = Ecc / Ecc[-1] + # logging.info('sv={} Ecc={}'.format(sv, Ecc)) rank = np.where(Ecc > pca_var_r)[0][0] pca = None - logging.info('x rank=%d' % (rank)) + logging.info("x rank=%d" % (rank)) if rank < x.shape[1]: # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x) x = pca.predict(x) if lda_dim > rank: lda_dim = rank if y_dim > rank: y_dim = rank - logging.info('PCA rank=%d' % (rank)) + logging.info("PCA rank=%d" % (rank)) # Train LDA - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("LNorm Elapsed time: %.2f s." 
% (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, epochs=epochs, - ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) plda = train_dplda(x_ln, class_ids, plda, output_path) - logging.info('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("PLDA Elapsed time: %.2f s." % (time.time() - t1)) if not os.path.exists(output_path): os.makedirs(output_path) - u2c_trn.save(output_path + '/knn_1') + u2c_trn.save(output_path + "/knn_1") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_1.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_1.csv", elbo, delimiter=",") - # Select adaptaton cohort x_trn_ln = x_ln[:n_trn] x_et_ln = x_ln[n_trn:] @@ -181,11 +226,15 @@ def train_be(x_et, x_trn, class_ids_trn, u2c_trn, u2c_trn = Utt2Info(u2c_trn.utt_info[cohort_seg_mask]) x = np.concatenate((x_trn, x_et), axis=0) - class_ids_et = (np.max(class_ids_trn)+1)*np.ones((x_et.shape[0],), dtype=np.int) + class_ids_et = (np.max(class_ids_trn) + 1) * np.ones((x_et.shape[0],), dtype=np.int) class_ids = np.concatenate((class_ids_trn, class_ids_et), axis=0) - _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) - - logging.info('Select num_spks={} num_segments={} {}'.format(np.max(class_ids)+1, x.shape[0], k_nn_2)) + _, class_ids = np.unique(class_ids, return_inverse=True) # make classids 0-(N-1) + + logging.info( + "Select num_spks={} num_segments={} {}".format( + np.max(class_ids) + 1, x.shape[0], k_nn_2 + ) + ) # Adapt PLDA if pca: x = pca.predict(x) @@ -198,32 +247,49 @@ def train_be(x_et, x_trn, class_ids_trn, u2c_trn, else: preproc = TransformList([pca, lda, lnorm]) - preproc.save(output_path + '/lda_lnorm.h5') - + preproc.save(output_path + "/lda_lnorm.h5") + x_ln = lnorm.predict(x_lda) plda_adapt1 = plda.copy() - if np.max(class_ids)+1 < plda.y_dim: + if np.max(class_ids) + 1 < plda.y_dim: plda.update_V = False elbo = plda.fit(x_ln, class_ids, epochs=epochs) plda = train_dplda(x_ln, class_ids, plda, output_path) plda_adapt1.weighted_avg_model(plda, w_mu, w_B, w_W) - plda_adapt1.save(output_path + '/plda.h5') + plda_adapt1.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_2.csv', elbo, delimiter=',') - - u2c_trn.save(output_path + '/knn_2') - - -def train_bes(v_file_train, train_list, - v_file_enroll_test, enroll_test_list, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, k_nn_1, k_nn_2, - w_mu, w_B, w_W, pca_var_r, - output_path, part_idx, num_parts, **kwargs): + np.savetxt(output_path + "/elbo_2.csv", elbo, delimiter=",") + + u2c_trn.save(output_path + "/knn_2") + + +def train_bes( + v_file_train, + train_list, + v_file_enroll_test, + enroll_test_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + k_nn_1, + k_nn_2, + w_mu, + w_B, + w_W, + pca_var_r, + output_path, + part_idx, + num_parts, + **kwargs +): # Read train data vcr_args = VCR.filter_args(**kwargs) @@ -231,57 +297,73 @@ def train_bes(v_file_train, train_list, x_trn, class_ids_trn = vcr.read() u2c_trn = vcr.u2c del vcr - + reader = DRF.create(v_file_enroll_test) u2c = Utt2Info.load(enroll_test_list) u2c = u2c.split(part_idx, num_parts, group_by_field=1) class_names_et, class_ids_et = np.unique(u2c.info, 
return_inverse=True) num_classes_et = np.max(class_ids_et) + 1 for c in range(num_classes_et): - logging.info('Training PLDA for %s' % (class_names_et[c])) + logging.info("Training PLDA for %s" % (class_names_et[c])) sel_idx = class_ids_et == c key_c = u2c.key[sel_idx] x_et = reader.read(key_c, squeeze=True) - output_path_c = output_path + '/' + class_names_et[c] - train_be(x_et, x_trn, class_ids_trn, u2c_trn, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, k_nn_1, k_nn_2, pca_var_r, - w_mu, w_B, w_W, output_path_c) + output_path_c = output_path + "/" + class_names_et[c] + train_be( + x_et, + x_trn, + class_ids_trn, + u2c_trn, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + k_nn_1, + k_nn_2, + pca_var_r, + w_mu, + w_B, + w_W, + output_path_c, + ) + - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train a Back-end for each trial side using kNN') - - parser.add_argument('--v-file-train', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--v-file-enroll-test', required=True) - parser.add_argument('--enroll-test-list', required=True) - + fromfile_prefix_chars="@", + description="Train a Back-end for each trial side using kNN", + ) + + parser.add_argument("--v-file-train", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--v-file-enroll-test", required=True) + parser.add_argument("--enroll-test-list", required=True) + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', required=True) - parser.add_argument('--lda-dim', type=int, default=150) - parser.add_argument('--k-nn-1', type=int, default=500) - parser.add_argument('--k-nn-2', type=int, default=600) - parser.add_argument('--pca-var-r', type=float, default=1) - parser.add_argument('--w-mu', dest='w_mu', type=float, default=1) - parser.add_argument('--w-b', dest='w_B', type=float, default=0.5) - parser.add_argument('--w-w', dest='w_W', type=float, default=0.5) - parser.add_argument('--part-idx', type=int, default=1) - parser.add_argument('--num-parts', type=int, default=1) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", required=True) + parser.add_argument("--lda-dim", type=int, default=150) + parser.add_argument("--k-nn-1", type=int, default=500) + parser.add_argument("--k-nn-2", type=int, default=600) + parser.add_argument("--pca-var-r", type=float, default=1) + parser.add_argument("--w-mu", dest="w_mu", type=float, default=1) + parser.add_argument("--w-b", dest="w_B", type=float, default=0.5) + parser.add_argument("--w-w", dest="w_W", type=float, default=0.5) + parser.add_argument("--part-idx", type=int, default=1) + parser.add_argument("--num-parts", type=int, default=1) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_bes(**vars(args)) - + train_bes(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py b/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py index 0537c0e7..a388fb88 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-v1.py @@ -17,11 +17,20 @@ from 
numpy.linalg import matrix_rank -def train_be(iv_file, train_list, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - output_path, **kwargs): + +def train_be( + iv_file, + train_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + output_path, + **kwargs +): # Read data vcr_args = VCR.filter_args(**kwargs) @@ -33,39 +42,37 @@ def train_be(iv_file, train_list, pca = None if rank < x.shape[1]: # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x) x = pca.predict(x) if lda_dim > rank: lda_dim = rank if y_dim > rank: y_dim = rank - logging.info('PCA rank=%d' % (rank)) + logging.info("PCA rank=%d" % (rank)) # Train LDA - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("PLDA Elapsed time: %.2f s." % (time.time() - t1)) # Save models if pca is None: @@ -76,32 +83,30 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE20 telephone condition') + fromfile_prefix_chars="@", + description="Train Back-end for SRE20 telephone condition", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=150) - args=parser.parse_args() - - train_be(**vars(args)) - + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=150) + args = parser.parse_args() + + train_be(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py b/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py index b81617f2..ac5bfa7e 100755 --- 
a/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-v3.py @@ -18,11 +18,25 @@ from numpy.linalg import matrix_rank -def train_be(iv_file, train_list, adapt_iv_file, adapt_list, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - w_mu, w_B, w_W, - output_path, **kwargs): + +def train_be( + iv_file, + train_list, + adapt_iv_file, + adapt_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + w_mu, + w_B, + w_W, + output_path, + **kwargs +): # Read data vcr_args = VCR.filter_args(**kwargs) @@ -35,38 +49,36 @@ def train_be(iv_file, train_list, adapt_iv_file, adapt_list, pca = None if rank < x.shape[1]: # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x) x = pca.predict(x) if lda_dim > rank: lda_dim = rank if y_dim > rank: y_dim = rank - logging.info('PCA rank=%d' % (rank)) + logging.info("PCA rank=%d" % (rank)) - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("PLDA Elapsed time: %.2f s." 
% (time.time() - t1)) # Save models if pca is None: @@ -80,13 +92,13 @@ def train_be(iv_file, train_list, adapt_iv_file, adapt_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + # Load labeled adapt data vcr = VCR(adapt_iv_file, adapt_list, None) x_adapt, class_ids_adapt = vcr.read() @@ -102,47 +114,45 @@ def train_be(iv_file, train_list, adapt_iv_file, adapt_list, else: preproc = TransformList([pca, lda, lnorm]) - preproc.save(output_path + '/lda_lnorm_adapt.h5') - + preproc.save(output_path + "/lda_lnorm_adapt.h5") + x_adapt_ln = lnorm.predict(x_adapt_lda) plda_adapt1 = plda.copy() - if np.max(class_ids_adapt)+1 < plda.y_dim: + if np.max(class_ids_adapt) + 1 < plda.y_dim: plda.update_V = False elbo = plda.fit(x_adapt_ln, class_ids_adapt, epochs=20) plda_adapt1.weighted_avg_model(plda, w_mu, w_B, w_W) - plda_adapt1.save(output_path + '/plda_adapt.h5') + plda_adapt1.save(output_path + "/plda_adapt.h5") num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo_adapt.csv", elbo, delimiter=",") + - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE20 telephone condition') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file', dest='adapt_iv_file', required=True) - parser.add_argument('--adapt-list', dest='adapt_list', required=True) - + fromfile_prefix_chars="@", + description="Train Back-end for SRE20 telephone condition", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file", dest="adapt_iv_file", required=True) + parser.add_argument("--adapt-list", dest="adapt_list", required=True) + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', required=True) - parser.add_argument('--lda-dim', type=int, default=150) - parser.add_argument('--w-mu', dest='w_mu', type=float, default=1) - parser.add_argument('--w-b', dest='w_B', type=float, default=0.5) - parser.add_argument('--w-w', dest='w_W', type=float, default=0.5) - - args=parser.parse_args() - - train_be(**vars(args)) - + parser.add_argument("--output-path", required=True) + parser.add_argument("--lda-dim", type=int, default=150) + parser.add_argument("--w-mu", dest="w_mu", type=float, default=1) + parser.add_argument("--w-b", dest="w_B", type=float, default=0.5) + parser.add_argument("--w-w", dest="w_W", type=float, default=0.5) + + args = parser.parse_args() + + train_be(**vars(args)) diff --git a/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py b/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py index 671252d5..7326d649 100755 --- a/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py +++ b/egs/sre20-cts/v1/steps_be/train-tel-be-v4.py @@ -18,15 +18,28 @@ from numpy.linalg import matrix_rank -def train_be(iv_file, train_list, - 
adapt_iv_file, adapt_list, - lda_dim, plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - w_mu, w_B, w_W, - w_coral_mu, w_coral_T, - output_path, **kwargs): - - # Read train data +def train_be( + iv_file, + train_list, + adapt_iv_file, + adapt_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + w_mu, + w_B, + w_W, + w_coral_mu, + w_coral_T, + output_path, + **kwargs +): + + # Read train data vcr_args = VCR.filter_args(**kwargs) vcr = VCR(iv_file, train_list, None, **vcr_args) x, class_ids = vcr.read() @@ -39,7 +52,7 @@ def train_be(iv_file, train_list, pca = None if rank < x.shape[1]: # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x) x = pca.predict(x) x_adapt = pca.predict(x_adapt) @@ -47,7 +60,7 @@ def train_be(iv_file, train_list, lda_dim = rank if y_dim > rank: y_dim = rank - logging.info('PCA rank=%d' % (rank)) + logging.info("PCA rank=%d" % (rank)) # Train CORAL t1 = time.time() @@ -62,7 +75,7 @@ def train_be(iv_file, train_list, # Train LDA x_lab_tot = np.concatenate((x_coral, x_adapt), axis=0) class_ids_lab_tot = np.concatenate((class_ids, class_ids_adapt)) - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x_lab_tot, class_ids_lab_tot) del x_lab_tot @@ -70,12 +83,12 @@ def train_be(iv_file, train_list, x_lda = lda.predict(x_coral) x_adapt_lda = lda.predict(x_adapt) - logging.info('LDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA Elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening t1 = time.time() x_lda_all = np.concatenate((x_lda, x_adapt_lda), axis=0) - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda_all) del x_lda_all @@ -90,18 +103,18 @@ def train_be(iv_file, train_list, # Apply lnorm to in-domain x_adapt_ln = lnorm_in.predict(x_adapt_lda) - logging.info('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() x_lab_ln = np.concatenate((x_ln, x_adapt_ln), axis=0) class_ids_lab_tot = np.concatenate((class_ids, class_ids_adapt)) - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_lab_ln, class_ids_lab_tot, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit( + x_lab_ln, class_ids_lab_tot, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs + ) - logging.info('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("PLDA Elapsed time: %.2f s." 
% (time.time() - t1)) # Save models if pca is None: @@ -112,62 +125,56 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") if pca is None: preproc = TransformList([lda, lnorm_in]) else: preproc = TransformList([pca, lda, lnorm_in]) - preproc.save(output_path + '/lda_lnorm_adapt.h5') - + preproc.save(output_path + "/lda_lnorm_adapt.h5") + plda_adapt1 = plda.copy() - if np.max(class_ids_adapt)+1 < plda.y_dim: + if np.max(class_ids_adapt) + 1 < plda.y_dim: plda.update_V = False elbo = plda.fit(x_adapt_ln, class_ids_adapt, epochs=20) plda_adapt1.weighted_avg_model(plda, w_mu, w_B, w_W) - plda_adapt1.save(output_path + '/plda_adapt.h5') + plda_adapt1.save(output_path + "/plda_adapt.h5") num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt.csv", elbo, delimiter=",") + - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE20 telephone condition') - - parser.add_argument('--iv-file', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--adapt-iv-file', required=True) - parser.add_argument('--adapt-list', required=True) - + fromfile_prefix_chars="@", + description="Train Back-end for SRE20 telephone condition", + ) + + parser.add_argument("--iv-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--adapt-iv-file", required=True) + parser.add_argument("--adapt-list", required=True) + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=150) - parser.add_argument('--w-coral-mu', dest='w_coral_mu', type=float, - default=0.5) - parser.add_argument('--w-coral-t', dest='w_coral_T', type=float, - default=0.75) - parser.add_argument('--w-mu', dest='w_mu', type=float, - default=1) - parser.add_argument('--w-b', dest='w_B', type=float, - default=1) - parser.add_argument('--w-w', dest='w_W', type=float, - default=1) - args=parser.parse_args() - + + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=150) + parser.add_argument("--w-coral-mu", dest="w_coral_mu", type=float, default=0.5) + parser.add_argument("--w-coral-t", dest="w_coral_T", type=float, default=0.75) + parser.add_argument("--w-mu", dest="w_mu", type=float, default=1) + parser.add_argument("--w-b", dest="w_B", type=float, default=1) + parser.add_argument("--w-w", dest="w_W", type=float, default=1) + args = parser.parse_args() + train_be(**vars(args)) diff --git a/egs/sre21-av-a/v1.16k/local/prepare_sre21av_eval_audio.py b/egs/sre21-av-a/v1.16k/local/prepare_sre21av_eval_audio.py index cca67cbc..f8943907 100755 --- a/egs/sre21-av-a/v1.16k/local/prepare_sre21av_eval_audio.py +++ b/egs/sre21-av-a/v1.16k/local/prepare_sre21av_eval_audio.py @@ -250,7 +250,8 @@ def 
prepare_sre21av_eval_audio( segments_file = corpus_dir / "docs" / "sre21_eval_segment_key.tsv" df_segms = pd.read_csv(segments_file, sep="\t") df_segms.rename( - columns={"segmentid": "segment_id", "subjectid": "speaker_id"}, inplace=True, + columns={"segmentid": "segment_id", "subjectid": "speaker_id"}, + inplace=True, ) df_segms.replace({"language": "english"}, {"language": "ENG"}, inplace=True) df_segms.replace({"language": "cantonese"}, {"language": "YUE"}, inplace=True) @@ -259,12 +260,14 @@ def prepare_sre21av_eval_audio( enroll_file = corpus_dir / "docs" / "sre21_audio_eval_enrollment.tsv" df_enr = pd.read_csv(enroll_file, sep="\t") df_enr.rename( - columns={"segmentid": "segment_id", "modelid": "model_id"}, inplace=True, + columns={"segmentid": "segment_id", "modelid": "model_id"}, + inplace=True, ) key_file = corpus_dir / "docs" / "sre21_audio_eval_trial_key.tsv" df_key = pd.read_csv(key_file, sep="\t") df_key.rename( - columns={"segmentid": "segment_id", "modelid": "model_id"}, inplace=True, + columns={"segmentid": "segment_id", "modelid": "model_id"}, + inplace=True, ) df_model = make_enroll_dir(df_segms, df_enr, wav_dir, target_fs, output_path) @@ -275,7 +278,8 @@ def prepare_sre21av_eval_audio( key_file = corpus_dir / "docs" / "sre21_audio-visual_eval_trial_key.tsv" df_key = pd.read_csv(key_file, sep="\t") df_key.rename( - columns={"segmentid": "segment_id", "modelid": "model_id"}, inplace=True, + columns={"segmentid": "segment_id", "modelid": "model_id"}, + inplace=True, ) wav_dir = corpus_dir / "data" / "video" make_test_dir( diff --git a/egs/sre21-av-v/v0.1/local/prepare_sre21av_dev_visual.py b/egs/sre21-av-v/v0.1/local/prepare_sre21av_dev_visual.py index a594b6bc..b2e1f9a4 100755 --- a/egs/sre21-av-v/v0.1/local/prepare_sre21av_dev_visual.py +++ b/egs/sre21-av-v/v0.1/local/prepare_sre21av_dev_visual.py @@ -84,13 +84,15 @@ def prepare_sre21av_dev_visual(corpus_dir, output_path, verbose): segments_file = corpus_dir / "docs" / "sre21_dev_segment_key.tsv" df_segms = pd.read_csv(segments_file, sep="\t") df_segms.rename( - columns={"segmentid": "segment_id", "subjectid": "speaker_id"}, inplace=True, + columns={"segmentid": "segment_id", "subjectid": "speaker_id"}, + inplace=True, ) key_file = corpus_dir / "docs" / "sre21_visual_dev_trial_key.tsv" df_key = pd.read_csv(key_file, sep="\t") df_key.rename( - columns={"segmentid": "segment_id", "imageid": "model_id"}, inplace=True, + columns={"segmentid": "segment_id", "imageid": "model_id"}, + inplace=True, ) make_enroll_dir(df_segms, img_dir, output_path) diff --git a/egs/sre21-av-v/v0.1/local/prepare_sre21av_eval_visual.py b/egs/sre21-av-v/v0.1/local/prepare_sre21av_eval_visual.py index 51c1c492..e681e26e 100755 --- a/egs/sre21-av-v/v0.1/local/prepare_sre21av_eval_visual.py +++ b/egs/sre21-av-v/v0.1/local/prepare_sre21av_eval_visual.py @@ -84,13 +84,15 @@ def prepare_sre21av_eval_visual(corpus_dir, output_path, verbose): segments_file = corpus_dir / "docs" / "sre21_eval_segment_key.tsv" df_segms = pd.read_csv(segments_file, sep="\t") df_segms.rename( - columns={"segmentid": "segment_id", "subjectid": "speaker_id"}, inplace=True, + columns={"segmentid": "segment_id", "subjectid": "speaker_id"}, + inplace=True, ) key_file = corpus_dir / "docs" / "sre21_visual_eval_trial_key.tsv" df_key = pd.read_csv(key_file, sep="\t") df_key.rename( - columns={"segmentid": "segment_id", "imageid": "model_id"}, inplace=True, + columns={"segmentid": "segment_id", "imageid": "model_id"}, + inplace=True, ) make_enroll_dir(df_segms, img_dir, output_path) 
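The prepare_sre21av_* scripts touched above and below all share the same pandas idiom: read a NIST docs TSV, rename its headers (segmentid, subjectid, modelid, imageid) to the snake_case identifiers the recipes expect, optionally normalize language labels, and then hand the frame to the repo's make_enroll_dir/make_test_dir helpers. A minimal standalone sketch of that idiom, assuming a hypothetical corpus layout and writing a plain CSV instead of calling the repo helpers:

#!/usr/bin/env python
# Illustrative sketch of the column-renaming pattern used by the
# prepare_sre21av_* scripts; corpus_dir, output_path and the output
# file name are assumptions, not part of the repository.
from pathlib import Path

import pandas as pd

corpus_dir = Path("/export/corpora/sre21")      # hypothetical corpus location
output_path = Path("data/sre21_eval_segments")  # hypothetical output directory

segments_file = corpus_dir / "docs" / "sre21_eval_segment_key.tsv"
df_segms = pd.read_csv(segments_file, sep="\t")

# NIST headers are segmentid/subjectid; downstream code expects snake_case.
df_segms.rename(
    columns={"segmentid": "segment_id", "subjectid": "speaker_id"},
    inplace=True,
)

# Map the free-form language labels to the short codes used by the recipes.
df_segms.replace({"language": "english"}, {"language": "ENG"}, inplace=True)
df_segms.replace({"language": "cantonese"}, {"language": "YUE"}, inplace=True)

output_path.mkdir(parents=True, exist_ok=True)
df_segms.to_csv(output_path / "segments.csv", sep=",", index=False)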
diff --git a/egs/sre21-av-v/v0.1/local/prepare_sre21av_eval_visual_nokey.py b/egs/sre21-av-v/v0.1/local/prepare_sre21av_eval_visual_nokey.py index b2753ce7..7efccbef 100755 --- a/egs/sre21-av-v/v0.1/local/prepare_sre21av_eval_visual_nokey.py +++ b/egs/sre21-av-v/v0.1/local/prepare_sre21av_eval_visual_nokey.py @@ -69,7 +69,8 @@ def prepare_sre21av_eval_visual(corpus_dir, output_path, verbose): key_file = corpus_dir / "docs" / "sre21_visual_eval_trials.tsv" df_trials = pd.read_csv(key_file, sep="\t") df_trials.rename( - columns={"segmentid": "segment_id", "imageid": "model_id"}, inplace=True, + columns={"segmentid": "segment_id", "imageid": "model_id"}, + inplace=True, ) make_enroll_dir(df_trials, img_dir, output_path) @@ -79,7 +80,8 @@ def prepare_sre21av_eval_visual(corpus_dir, output_path, verbose): df_trials = pd.read_csv(key_file, sep="\t") df_trials = df_trials.drop("modelid", axis=1).drop_duplicates() df_trials.rename( - columns={"segmentid": "segment_id", "imageid": "model_id"}, inplace=True, + columns={"segmentid": "segment_id", "imageid": "model_id"}, + inplace=True, ) test_dir = Path(output_path + "_test") df_trials.to_csv(test_dir / "trials_av.csv", sep=",", index=False) diff --git a/egs/sre21-av-v/v0.1/local/score_dcf.py b/egs/sre21-av-v/v0.1/local/score_dcf.py index 4fffc9e8..514ebf51 100755 --- a/egs/sre21-av-v/v0.1/local/score_dcf.py +++ b/egs/sre21-av-v/v0.1/local/score_dcf.py @@ -24,48 +24,54 @@ def score_dcf(key_file, score_file, output_path): - logging.info('Load key: %s' % key_file) + logging.info("Load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('Load scores: %s' % score_file) + logging.info("Load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) - priors = np.array([0.001, 0.005, 0.01, 0.05 ]) + priors = np.array([0.001, 0.005, 0.01, 0.05]) min_dcf, act_dcf, eer, _ = fast_eval(tar, non, priors) - + output_dir = os.path.dirname(output_path) if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_file = output_path + '_results' - with open(output_file, 'w') as f: - s = 'EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}'.format( - eer * 100, min_dcf[3], act_dcf[3], - min_dcf[2], act_dcf[2], - min_dcf[1], act_dcf[1], - min_dcf[0], act_dcf[0]) + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f}".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + ) f.write(s) logging.info(s) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Computes EER and DCF') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--score-file', dest='score_file', required=True) - 
parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - score_dcf(**vars(args)) - + score_dcf(**vars(args)) diff --git a/egs/sre21-av/v1/local/prepare_sre21av_dev.py b/egs/sre21-av/v1/local/prepare_sre21av_dev.py index bbe82f25..2560b8c9 100755 --- a/egs/sre21-av/v1/local/prepare_sre21av_dev.py +++ b/egs/sre21-av/v1/local/prepare_sre21av_dev.py @@ -185,7 +185,8 @@ def prepare_sre21av_dev(corpus_dir, output_path, verbose): segments_file = corpus_dir / "docs" / "sre21_dev_segment_key.tsv" df_segms = pd.read_csv(segments_file, sep="\t") df_segms.rename( - columns={"segmentid": "segment_id", "subjectid": "speaker_id"}, inplace=True, + columns={"segmentid": "segment_id", "subjectid": "speaker_id"}, + inplace=True, ) df_segms.replace({"language": "english"}, {"language": "ENG"}, inplace=True) df_segms.replace({"language": "cantonese"}, {"language": "YUE"}, inplace=True) @@ -194,7 +195,11 @@ def prepare_sre21av_dev(corpus_dir, output_path, verbose): enroll_file = corpus_dir / "docs" / "sre21_audio_dev_enrollment.tsv" df_enr = pd.read_csv(enroll_file, sep="\t") df_enr.rename( - columns={"segmentid": "segment_id", "modelid": "model_id",}, inplace=True, + columns={ + "segmentid": "segment_id", + "modelid": "model_id", + }, + inplace=True, ) df_model = make_enroll_dir(df_segms, df_enr) diff --git a/egs/sre21-av/v1/local/prepare_sre21av_eval_nokey.py b/egs/sre21-av/v1/local/prepare_sre21av_eval_nokey.py index 7b78eef4..9fcd1f07 100755 --- a/egs/sre21-av/v1/local/prepare_sre21av_eval_nokey.py +++ b/egs/sre21-av/v1/local/prepare_sre21av_eval_nokey.py @@ -167,7 +167,8 @@ def prepare_sre21av_eval(corpus_dir, output_path, verbose): enroll_file = corpus_dir / "docs" / "sre21_audio_eval_enrollment.tsv" df_enr = pd.read_csv(enroll_file, sep="\t") df_enr.rename( - columns={"segmentid": "segment_id", "modelid": "model_id"}, inplace=True, + columns={"segmentid": "segment_id", "modelid": "model_id"}, + inplace=True, ) df_model = make_enroll_dir(df_enr) diff --git a/egs/sre21-av/v1/local/sum_fusion.py b/egs/sre21-av/v1/local/sum_fusion.py index c0f3e0d4..4e88c17c 100755 --- a/egs/sre21-av/v1/local/sum_fusion.py +++ b/egs/sre21-av/v1/local/sum_fusion.py @@ -47,7 +47,8 @@ def sum_fusion(ndx_file, audio_scores, visual_scores, output_scores, verbose): if __name__ == "__main__": parser = ArgumentParser(description="fuses audio and visual scores") parser.add_argument( - "--ndx-file", required=True, + "--ndx-file", + required=True, ) parser.add_argument("--audio-scores", required=True) parser.add_argument("--visual-scores", required=True) diff --git a/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py b/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py index af28da7e..78231ba1 100755 --- a/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py +++ b/egs/voices_challenge/v0/steps_be/eval-be-snorm-v1.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -28,13 +25,23 @@ from hyperion.helpers import VectorReader as VR -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - coh_iv_file, coh_list, coh_nbest, coh_nbest_discard, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + 
coh_nbest_discard, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -42,79 +49,84 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc) x_e, x_t, enroll, ndx = tdr.read() - - logging.info('loading plda model: %s' % (model_file)) + + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) - - logging.info('loading cohort data') + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) + + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_1vs1(x_e, x_coh) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest, nbest_discard=coh_nbest_discard) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA with S-Norm') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--coh-nbest-discard', dest='coh_nbest_discard', type=int, default=0) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA with S-Norm", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument( + "--coh-nbest-discard", dest="coh_nbest_discard", type=int, default=0 + ) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/voices_challenge/v0/steps_be/eval-be-v1.py b/egs/voices_challenge/v0/steps_be/eval-be-v1.py index f0b2ab9a..dc3e3f87 100755 --- a/egs/voices_challenge/v0/steps_be/eval-be-v1.py +++ b/egs/voices_challenge/v0/steps_be/eval-be-v1.py @@ -22,12 +22,19 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -36,50 +43,51 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc) x_e, x_t, enroll, ndx = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = 
F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py b/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py index cffa1241..fa16dfce 100755 --- a/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py +++ b/egs/voices_challenge/v0/steps_be/eval-calibration-v1.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -28,45 +25,45 @@ def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - - logging.info('load scores: %s' % in_score_file) + + logging.info("load scores: %s" % in_score_file) scr = TrialScores.load_txt(in_score_file) scr = scr.align_with_ndx(ndx) - logging.info('load model: %s' % model_file) + logging.info("load model: %s" % model_file) lr = LR.load(model_file) - logging.info('apply calibration') + logging.info("apply calibration") s_cal = lr.predict(scr.scores.ravel()) scr.scores = np.reshape(s_cal, scr.scores.shape) - logging.info('save scores: %s' % out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - -if __name__ == "__main__": - parser=argparse.ArgumentParser( - 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals linear calibration') - parser.add_argument('--in-score-file', dest='in_score_file', required=True) - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) +if __name__ == "__main__": - args=parser.parse_args() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Evals linear calibration", + ) + + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_calibration(**vars(args)) - + eval_calibration(**vars(args)) diff --git a/egs/voices_challenge/v0/steps_be/train-be-v1.py b/egs/voices_challenge/v0/steps_be/train-be-v1.py index e5c9e29c..44f93a57 100755 --- a/egs/voices_challenge/v0/steps_be/train-be-v1.py +++ b/egs/voices_challenge/v0/steps_be/train-be-v1.py @@ -21,104 +21,109 @@ from hyperion.helpers import PLDAFactory as F -def train_be(iv_file, train_list, - adapt_iv_file_1, adapt_list_1, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - output_path, **kwargs): +def train_be( + iv_file, + train_list, + adapt_iv_file_1, + adapt_list_1, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + output_path, + **kwargs +): # Read data - logging.info('loading data') + logging.info("loading data") vcr_args = VCR.filter_args(**kwargs) vcr_train = VCR(iv_file, train_list, None, **vcr_args) x, class_ids = vcr_train.read() # Train LDA - logging.info('train LDA') + logging.info("train LDA") t1 = time.time() - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening - logging.info('train length norm') + logging.info("train length norm") t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('length norm elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("length norm elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA - logging.info('train PLDA') + logging.info("train PLDA") t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA elapsed time: %.2f s.' % (time.time()-t1)) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + logging.info("PLDA elapsed time: %.2f s." 
% (time.time() - t1)) # Save models - logging.info('saving models') + logging.info("saving models") preproc = TransformList(lda) preproc.append(lnorm) if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + # Compute mean for adapted data - logging.info('loading adaptation data') + logging.info("loading adaptation data") vr = VR(adapt_iv_file_1, adapt_list_1, None) x = vr.read() - - logging.info('adapting centering') + + logging.info("adapting centering") x = lda.predict(x) lnorm.update_T = False lnorm.fit(x) - logging.info('saving adapted LDA+LNorm model') + logging.info("saving adapted LDA+LNorm model") preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt.h5') + preproc.save(output_path + "/lda_lnorm_adapt.h5") - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file-1', dest='adapt_iv_file_1', required=True) - parser.add_argument('--adapt-list-1', dest='adapt_list_1', required=True) - + fromfile_prefix_chars="@", + description="Train Back-end", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file-1", dest="adapt_iv_file_1", required=True) + parser.add_argument("--adapt-list-1", dest="adapt_list_1", required=True) + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_be(**vars(args)) - + train_be(**vars(args)) diff --git a/egs/voices_challenge/v0/steps_be/train-be-v2.py b/egs/voices_challenge/v0/steps_be/train-be-v2.py index 8efc85f7..cd4d4470 100755 --- a/egs/voices_challenge/v0/steps_be/train-be-v2.py +++ b/egs/voices_challenge/v0/steps_be/train-be-v2.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -26,78 +23,88 @@ from hyperion.utils.utt2info import Utt2Info -def train_be(iv_file, train_list, - adapt_iv_file, adapt_list, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - adapt_y_dim, w_mu, w_B, w_W, - output_path, **kwargs): +def train_be( + iv_file, + train_list, + adapt_iv_file, + adapt_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + adapt_y_dim, + w_mu, + w_B, + w_W, + output_path, + **kwargs +): # Read data - 
logging.info('loading data') + logging.info("loading data") vcr_args = VCR.filter_args(**kwargs) vcr = VCR(iv_file, train_list, None, **vcr_args) x, class_ids = vcr.read() # Train LDA - logging.info('train LDA') + logging.info("train LDA") t1 = time.time() - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening - logging.info('train length norm') + logging.info("train length norm") t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('length norm elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("length norm elapsed time: %.2f s." % (time.time() - t1)) # Train PLDA - logging.info('train PLDA') + logging.info("train PLDA") t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA elapsed time: %.2f s.' % (time.time()-t1)) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + logging.info("PLDA elapsed time: %.2f s." % (time.time() - t1)) # Save models - logging.info('saving models') + logging.info("saving models") preproc = TransformList(lda) preproc.append(lnorm) if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + # supervised adaptation - logging.info('loading adaptation data') + logging.info("loading adaptation data") vcr = VCR(adapt_iv_file, adapt_list, None) x, class_ids = vcr.read() - logging.info('adapting centering') + logging.info("adapting centering") x_lda = lda.predict(x) lnorm.update_T = False lnorm.fit(x_lda) - logging.info('saving adapted LDA+LNorm model') + logging.info("saving adapted LDA+LNorm model") preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt.h5') + preproc.save(output_path + "/lda_lnorm_adapt.h5") - logging.info('adapting PLDA') + logging.info("adapting PLDA") x_ln = lnorm.predict(x_lda) plda_adapt = plda @@ -109,53 +116,45 @@ def train_be(iv_file, train_list, plda.update_V = False elbo = plda.fit(x_ln, class_ids, epochs=20) plda_adapt.weighted_avg_model(plda, w_mu, w_B, w_W) - logging.info('PLDA elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("PLDA elapsed time: %.2f s." 
% (time.time() - t1)) - logging.info('saving adapted PLDA model') - plda_adapt.save(output_path + '/plda_adapt.h5') + logging.info("saving adapted PLDA model") + plda_adapt.save(output_path + "/plda_adapt.h5") num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt.csv", elbo, delimiter=",") + - - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE18 telephone condition') + fromfile_prefix_chars="@", + description="Train Back-end for SRE18 telephone condition", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file', dest='adapt_iv_file', required=True) - parser.add_argument('--adapt-list', dest='adapt_list', required=True) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file", dest="adapt_iv_file", required=True) + parser.add_argument("--adapt-list", dest="adapt_list", required=True) - VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=150) - parser.add_argument('--adapt-y-dim', dest='adapt_y_dim', - type=int, default=None) - parser.add_argument('--w-mu', dest='w_mu', type=float, - default=1) - parser.add_argument('--w-b', dest='w_B', type=float, - default=0) - parser.add_argument('--w-w', dest='w_W', type=float, - default=0.5) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=150) + parser.add_argument("--adapt-y-dim", dest="adapt_y_dim", type=int, default=None) + parser.add_argument("--w-mu", dest="w_mu", type=float, default=1) + parser.add_argument("--w-b", dest="w_B", type=float, default=0) + parser.add_argument("--w-w", dest="w_W", type=float, default=0.5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) train_be(**vars(args)) - - diff --git a/egs/voices_challenge/v0/steps_be/train-calibration-v1.py b/egs/voices_challenge/v0/steps_be/train-calibration-v1.py index a26b310b..fa1dfcf7 100755 --- a/egs/voices_challenge/v0/steps_be/train-calibration-v1.py +++ b/egs/voices_challenge/v0/steps_be/train-calibration-v1.py @@ -23,63 +23,65 @@ def train_calibration(score_file, key_file, model_file, prior, verbose): - logging.info('load key: %s' % key_file) + logging.info("load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('load scores: %s' % score_file) + logging.info("load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + 
n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) - logging.info('train calibration') + logging.info("train calibration") x = np.concatenate((tar, non)) - y = np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) lr = LR(prior=prior, verbose=verbose) lr.fit(x, y) print(lr.A) print(lr.b) - logging.info('save calibration at %s' % model_file) + logging.info("save calibration at %s" % model_file) lr.save(model_file) - logging.info('calibrate scores') + logging.info("calibrate scores") tar_cal = lr.predict(tar) non_cal = lr.predict(non) print(tar_cal) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - - - + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains llr calibration') - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Trains llr calibration", + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_calibration(**vars(args)) - + train_calibration(**vars(args)) diff --git a/egs/voices_challenge/v1/local/make_musan.py b/egs/voices_challenge/v1/local/make_musan.py index 8a93a18b..b0ae6846 100755 --- a/egs/voices_challenge/v1/local/make_musan.py +++ b/egs/voices_challenge/v1/local/make_musan.py @@ -7,125 +7,183 @@ import os, sys + def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals def prepare_music(root_dir, fs, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = 
os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_speech(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = 
utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_noise(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def main(): - in_dir = sys.argv[1] - fs = int(sys.argv[2]) - out_dir = sys.argv[3] - use_vocals = sys.argv[4] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") 
+ utt2spk_fi.write(utt2spk) -if __name__=="__main__": - main() +if __name__ == "__main__": + main() diff --git a/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py b/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py index af28da7e..78231ba1 100755 --- a/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py +++ b/egs/voices_challenge/v1/steps_be/eval-be-snorm-v1.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -28,13 +25,23 @@ from hyperion.helpers import VectorReader as VR -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - coh_iv_file, coh_list, coh_nbest, coh_nbest_discard, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + coh_iv_file, + coh_list, + coh_nbest, + coh_nbest_discard, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -42,79 +49,84 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc) x_e, x_t, enroll, ndx = tdr.read() - - logging.info('loading plda model: %s' % (model_file)) + + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) - - logging.info('loading cohort data') + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) + + logging.info("loading cohort data") vr = VR(coh_iv_file, coh_list, preproc) x_coh = vr.read() t2 = time.time() - logging.info('score cohort vs test') + logging.info("score cohort vs test") scores_coh_test = model.llr_1vs1(x_coh, x_t) - logging.info('score enroll vs cohort') + logging.info("score enroll vs cohort") scores_enr_coh = model.llr_1vs1(x_e, x_coh) dt = time.time() - t2 - logging.info('cohort-scoring elapsed time: %.2f s.' % (dt)) + logging.info("cohort-scoring elapsed time: %.2f s." % (dt)) t2 = time.time() - logging.info('apply s-norm') + logging.info("apply s-norm") snorm = SNorm(nbest=coh_nbest, nbest_discard=coh_nbest_discard) scores = snorm.predict(scores, scores_coh_test, scores_enr_coh) dt = time.time() - t2 - logging.info('s-norm elapsed time: %.2f s.' % (dt)) + logging.info("s-norm elapsed time: %.2f s." % (dt)) dt = time.time() - t1 - logging.info('total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "total-scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA with S-Norm') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--coh-iv-file', dest='coh_iv_file', required=True) - parser.add_argument('--coh-list', dest='coh_list', required=True) - parser.add_argument('--coh-nbest', dest='coh_nbest', type=int, default=100) - parser.add_argument('--coh-nbest-discard', dest='coh_nbest_discard', type=int, default=0) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA with S-Norm", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--coh-iv-file", dest="coh_iv_file", required=True) + parser.add_argument("--coh-list", dest="coh_list", required=True) + parser.add_argument("--coh-nbest", dest="coh_nbest", type=int, default=100) + parser.add_argument( + "--coh-nbest-discard", dest="coh_nbest_discard", type=int, default=0 + ) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/voices_challenge/v1/steps_be/eval-be-v1.py b/egs/voices_challenge/v1/steps_be/eval-be-v1.py index f0b2ab9a..dc3e3f87 100755 --- a/egs/voices_challenge/v1/steps_be/eval-be-v1.py +++ b/egs/voices_challenge/v1/steps_be/eval-be-v1.py @@ -22,12 +22,19 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -36,50 +43,51 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc) x_e, x_t, enroll, ndx = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = 
F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) - logging.info('saving scores to %s' % (score_file)) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py b/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py index cffa1241..fa16dfce 100755 --- a/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py +++ b/egs/voices_challenge/v1/steps_be/eval-calibration-v1.py @@ -7,9 +7,6 @@ """ - - - import sys import os import argparse @@ -28,45 +25,45 @@ def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - - logging.info('load scores: %s' % in_score_file) + + logging.info("load scores: %s" % in_score_file) scr = TrialScores.load_txt(in_score_file) scr = scr.align_with_ndx(ndx) - logging.info('load model: %s' % model_file) + logging.info("load model: %s" % model_file) lr = LR.load(model_file) - logging.info('apply calibration') + logging.info("apply calibration") s_cal = lr.predict(scr.scores.ravel()) scr.scores = np.reshape(s_cal, scr.scores.shape) - logging.info('save scores: %s' % out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - -if __name__ == "__main__": - parser=argparse.ArgumentParser( - 
formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals linear calibration') - parser.add_argument('--in-score-file', dest='in_score_file', required=True) - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) +if __name__ == "__main__": - args=parser.parse_args() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Evals linear calibration", + ) + + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_calibration(**vars(args)) - + eval_calibration(**vars(args)) diff --git a/egs/voices_challenge/v1/steps_be/train-be-v1.py b/egs/voices_challenge/v1/steps_be/train-be-v1.py index e5c9e29c..44f93a57 100755 --- a/egs/voices_challenge/v1/steps_be/train-be-v1.py +++ b/egs/voices_challenge/v1/steps_be/train-be-v1.py @@ -21,104 +21,109 @@ from hyperion.helpers import PLDAFactory as F -def train_be(iv_file, train_list, - adapt_iv_file_1, adapt_list_1, - lda_dim, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - output_path, **kwargs): +def train_be( + iv_file, + train_list, + adapt_iv_file_1, + adapt_list_1, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + output_path, + **kwargs +): # Read data - logging.info('loading data') + logging.info("loading data") vcr_args = VCR.filter_args(**kwargs) vcr_train = VCR(iv_file, train_list, None, **vcr_args) x, class_ids = vcr_train.read() # Train LDA - logging.info('train LDA') + logging.info("train LDA") t1 = time.time() - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("LDA elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening - logging.info('train length norm') + logging.info("train length norm") t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('length norm elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("length norm elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA - logging.info('train PLDA') + logging.info("train PLDA") t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA elapsed time: %.2f s.' % (time.time()-t1)) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + logging.info("PLDA elapsed time: %.2f s." 
% (time.time() - t1)) # Save models - logging.info('saving models') + logging.info("saving models") preproc = TransformList(lda) preproc.append(lnorm) if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + # Compute mean for adapted data - logging.info('loading adaptation data') + logging.info("loading adaptation data") vr = VR(adapt_iv_file_1, adapt_list_1, None) x = vr.read() - - logging.info('adapting centering') + + logging.info("adapting centering") x = lda.predict(x) lnorm.update_T = False lnorm.fit(x) - logging.info('saving adapted LDA+LNorm model') + logging.info("saving adapted LDA+LNorm model") preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt.h5') + preproc.save(output_path + "/lda_lnorm_adapt.h5") - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file-1', dest='adapt_iv_file_1', required=True) - parser.add_argument('--adapt-list-1', dest='adapt_list_1', required=True) - + fromfile_prefix_chars="@", + description="Train Back-end", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file-1", dest="adapt_iv_file_1", required=True) + parser.add_argument("--adapt-list-1", dest="adapt_list_1", required=True) + VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_be(**vars(args)) - + train_be(**vars(args)) diff --git a/egs/voices_challenge/v1/steps_be/train-be-v2.py b/egs/voices_challenge/v1/steps_be/train-be-v2.py index 08fd7078..36fbc341 100755 --- a/egs/voices_challenge/v1/steps_be/train-be-v2.py +++ b/egs/voices_challenge/v1/steps_be/train-be-v2.py @@ -22,76 +22,88 @@ from hyperion.utils.utt2info import Utt2Info -def train_be(iv_file, train_list, adapt_iv_file, adapt_list, lda_dim, - plda_type, y_dim, z_dim, epochs, ml_md, md_epochs, adapt_y_dim, - w_mu, w_B, w_W, output_path, **kwargs): +def train_be( + iv_file, + train_list, + adapt_iv_file, + adapt_list, + lda_dim, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + adapt_y_dim, + w_mu, + w_B, + w_W, + output_path, + **kwargs +): # Read data - logging.info('loading data') + logging.info("loading data") vcr_args = 
VCR.filter_args(**kwargs) vcr = VCR(iv_file, train_list, None, **vcr_args) x, class_ids = vcr.read() # Train LDA - logging.info('train LDA') + logging.info("train LDA") t1 = time.time() - lda = LDA(lda_dim=lda_dim, name='lda') + lda = LDA(lda_dim=lda_dim, name="lda") lda.fit(x, class_ids) x_lda = lda.predict(x) - logging.info('LDA elapsed time: %.2f s.' % (time.time() - t1)) + logging.info("LDA elapsed time: %.2f s." % (time.time() - t1)) # Train centering and whitening - logging.info('train length norm') + logging.info("train length norm") t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_lda) x_ln = lnorm.predict(x_lda) - logging.info('length norm elapsed time: %.2f s.' % (time.time() - t1)) + logging.info("length norm elapsed time: %.2f s." % (time.time() - t1)) # Train PLDA - logging.info('train PLDA') + logging.info("train PLDA") t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name='plda') - elbo = plda.fit(x_ln, - class_ids, - epochs=epochs, - ml_md=ml_md, - md_epochs=md_epochs) - logging.info('PLDA elapsed time: %.2f s.' % (time.time() - t1)) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + logging.info("PLDA elapsed time: %.2f s." % (time.time() - t1)) # Save models - logging.info('saving models') + logging.info("saving models") preproc = TransformList(lda) preproc.append(lnorm) if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lda_lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lda_lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") # supervised adaptation - logging.info('loading adaptation data') + logging.info("loading adaptation data") vcr = VCR(adapt_iv_file, adapt_list, None) x, class_ids = vcr.read() - logging.info('adapting centering') + logging.info("adapting centering") x_lda = lda.predict(x) lnorm.update_T = False lnorm.fit(x_lda) - logging.info('saving adapted LDA+LNorm model') + logging.info("saving adapted LDA+LNorm model") preproc = TransformList(lda) preproc.append(lnorm) - preproc.save(output_path + '/lda_lnorm_adapt.h5') + preproc.save(output_path + "/lda_lnorm_adapt.h5") - logging.info('adapting PLDA') + logging.info("adapting PLDA") x_ln = lnorm.predict(x_lda) plda_adapt = plda @@ -103,46 +115,41 @@ def train_be(iv_file, train_list, adapt_iv_file, adapt_list, lda_dim, plda.update_V = False elbo = plda.fit(x_ln, class_ids, epochs=20) plda_adapt.weighted_avg_model(plda, w_mu, w_B, w_W) - logging.info('PLDA elapsed time: %.2f s.' % (time.time() - t1)) + logging.info("PLDA elapsed time: %.2f s." 
% (time.time() - t1)) - logging.info('saving adapted PLDA model') - plda_adapt.save(output_path + '/plda_adapt.h5') + logging.info("saving adapted PLDA model") + plda_adapt.save(output_path + "/plda_adapt.h5") num = np.arange(20) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo_adapt.csv', elbo, delimiter=',') + np.savetxt(output_path + "/elbo_adapt.csv", elbo, delimiter=",") if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end for SRE18 telephone condition') + fromfile_prefix_chars="@", + description="Train Back-end for SRE18 telephone condition", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--adapt-iv-file', dest='adapt_iv_file', required=True) - parser.add_argument('--adapt-list', dest='adapt_list', required=True) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--adapt-iv-file", dest="adapt_iv_file", required=True) + parser.add_argument("--adapt-list", dest="adapt_list", required=True) VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, default=150) - parser.add_argument('--adapt-y-dim', - dest='adapt_y_dim', - type=int, - default=None) - parser.add_argument('--w-mu', dest='w_mu', type=float, default=1) - parser.add_argument('--w-b', dest='w_B', type=float, default=0) - parser.add_argument('--w-w', dest='w_W', type=float, default=0.5) - parser.add_argument('-v', - '--verbose', - dest='verbose', - default=1, - choices=[0, 1, 2, 3], - type=int) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=150) + parser.add_argument("--adapt-y-dim", dest="adapt_y_dim", type=int, default=None) + parser.add_argument("--w-mu", dest="w_mu", type=float, default=1) + parser.add_argument("--w-b", dest="w_B", type=float, default=0) + parser.add_argument("--w-w", dest="w_W", type=float, default=0.5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/egs/voices_challenge/v1/steps_be/train-calibration-v1.py b/egs/voices_challenge/v1/steps_be/train-calibration-v1.py index a26b310b..fa1dfcf7 100755 --- a/egs/voices_challenge/v1/steps_be/train-calibration-v1.py +++ b/egs/voices_challenge/v1/steps_be/train-calibration-v1.py @@ -23,63 +23,65 @@ def train_calibration(score_file, key_file, model_file, prior, verbose): - logging.info('load key: %s' % key_file) + logging.info("load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('load scores: %s' % score_file) + logging.info("load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss 
* 100, p_fa * 100, n_miss, n_fa) + ) - logging.info('train calibration') + logging.info("train calibration") x = np.concatenate((tar, non)) - y = np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) lr = LR(prior=prior, verbose=verbose) lr.fit(x, y) print(lr.A) print(lr.b) - logging.info('save calibration at %s' % model_file) + logging.info("save calibration at %s" % model_file) lr.save(model_file) - logging.info('calibrate scores') + logging.info("calibrate scores") tar_cal = lr.predict(tar) non_cal = lr.predict(non) print(tar_cal) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - - - + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains llr calibration') - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Trains llr calibration", + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_calibration(**vars(args)) - + train_calibration(**vars(args)) diff --git a/egs/voxceleb/adv.v2/local/filter_attacks.py b/egs/voxceleb/adv.v2/local/filter_attacks.py index f39b83e2..853b9806 100755 --- a/egs/voxceleb/adv.v2/local/filter_attacks.py +++ b/egs/voxceleb/adv.v2/local/filter_attacks.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -16,40 +21,41 @@ from hyperion.hyp_defs import float_cpu, config_logger + def filter_attacks(input_file, output_file, field, keep, remove): - logging.info('reading {}'.format(input_file)) - with open(input_file, 'r') as f: + logging.info("reading {}".format(input_file)) + with open(input_file, "r") as f: attack_info = yaml.load(f, Loader=yaml.FullLoader) - logging.info('selecting elements to remove') + logging.info("selecting elements to remove") rem_list = [] for k, v in attack_info.items(): if not (v[field] in keep) or v[field] in remove: rem_list.append(k) - logging.info('removing elements') + logging.info("removing elements") [attack_info.pop(k) for k in rem_list] - 
logging.info('saving {}'.format(output_file)) - with open(output_file, 'w') as f: + logging.info("saving {}".format(output_file)) + with open(output_file, "w") as f: yaml.dump(attack_info, f, sort_keys=True) - + if __name__ == "__main__": - parser = ArgumentParser( - description='Filters attacks in yaml file') - - parser.add_argument('--input-file', required=True) - parser.add_argument('--output-file', required=True) - parser.add_argument('--field', required=True) - parser.add_argument('--keep', nargs='+', default=[]) - parser.add_argument('--remove', nargs='+', default=[]) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = ArgumentParser(description="Filters attacks in yaml file") + + parser.add_argument("--input-file", required=True) + parser.add_argument("--output-file", required=True) + parser.add_argument("--field", required=True) + parser.add_argument("--keep", nargs="+", default=[]) + parser.add_argument("--remove", nargs="+", default=[]) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_musan.py b/egs/voxceleb/adv.v2/local/make_musan.py index 8a93a18b..b0ae6846 100755 --- a/egs/voxceleb/adv.v2/local/make_musan.py +++ b/egs/voxceleb/adv.v2/local/make_musan.py @@ -7,125 +7,183 @@ import os, sys + def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals def prepare_music(root_dir, fs, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if 
utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_speech(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_noise(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - 
print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def main(): - in_dir = sys.argv[1] - fs = int(sys.argv[2]) - out_dir = sys.argv[3] - use_vocals = sys.argv[4] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) -if __name__=="__main__": - main() +if __name__ == "__main__": + main() diff --git a/egs/voxceleb/adv.v2/local/make_some_figs.py b/egs/voxceleb/adv.v2/local/make_some_figs.py index 6f8d6fa7..0b2b672f 100755 --- a/egs/voxceleb/adv.v2/local/make_some_figs.py +++ b/egs/voxceleb/adv.v2/local/make_some_figs.py @@ -11,258 +11,428 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.metrics.verification_evaluator import VerificationAdvAttackEvaluator as Eval - -filenames = ['voxceleb1_attack_tar_snr_results.csv', - 'voxceleb1_attack_non_snr_results.csv', - 'voxceleb1_attack_tar_linf_results.csv', - 'voxceleb1_attack_non_linf_results.csv'] - -output_dir='exp/figs/resnet34_1/' -base_res_dir = 'exp/scores/' - - - -def plot_figs1(res_dirs1, legends, title_base, fig_base, fmt=['b','r','g','m','c','y'], clean_ref=0): +from hyperion.metrics.verification_evaluator import ( + VerificationAdvAttackEvaluator as Eval, +) + +filenames = [ + "voxceleb1_attack_tar_snr_results.csv", + "voxceleb1_attack_non_snr_results.csv", + "voxceleb1_attack_tar_linf_results.csv", + "voxceleb1_attack_non_linf_results.csv", +] + +output_dir = "exp/figs/resnet34_1/" +base_res_dir = "exp/scores/" + + +def plot_figs1( + 
res_dirs1, + legends, + title_base, + fig_base, + fmt=["b", "r", "g", "m", "c", "y"], + clean_ref=0, +): df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[0]) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[0]) df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_tar_snr' - Eval.plot_dcf_eer_vs_stat_v2(df, 'snr', fig_file, clean_ref=clean_ref, - xlabel='SNR(dB)', higher_better=True, - legends= legends, fmt=fmt, - title=title_base + ' attacks on target trials', - font_size=13) + fig_file = output_dir + fig_base + "_tar_snr" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "snr", + fig_file, + clean_ref=clean_ref, + xlabel="SNR(dB)", + higher_better=True, + legends=legends, + fmt=fmt, + title=title_base + " attacks on target trials", + font_size=13, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[1]) - df_i = pd.read_csv(file_path,index_col=0) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[1]) + df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_non_snr' - Eval.plot_dcf_eer_vs_stat_v2(df, 'snr', fig_file, clean_ref=clean_ref, - xlabel='SNR(dB)', higher_better=True, - legends=legends, fmt=fmt, - title=title_base + ' attacks on non-target trials', - font_size=13) - + fig_file = output_dir + fig_base + "_non_snr" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "snr", + fig_file, + clean_ref=clean_ref, + xlabel="SNR(dB)", + higher_better=True, + legends=legends, + fmt=fmt, + title=title_base + " attacks on non-target trials", + font_size=13, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[2]) - df_i = pd.read_csv(file_path,index_col=0) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[2]) + df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_tar_linf' - Eval.plot_dcf_eer_vs_stat_v2(df, 'n_linf', fig_file, clean_ref=clean_ref, - xlabel=r'$L_{\infty}$', log_x=True, - legends=legends, fmt=fmt, - title=title_base + ' attacks on target trials', - font_size=13) + fig_file = output_dir + fig_base + "_tar_linf" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "n_linf", + fig_file, + clean_ref=clean_ref, + xlabel=r"$L_{\infty}$", + log_x=True, + legends=legends, + fmt=fmt, + title=title_base + " attacks on target trials", + font_size=13, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[3]) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[3]) df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_non_linf' - Eval.plot_dcf_eer_vs_stat_v2(df, 'n_linf', fig_file, clean_ref=clean_ref, - xlabel=r'$L_{\infty}$', log_x=True, - legends=legends, fmt=fmt, - title=title_base + ' attacks on non-target trials', - font_size=13) - - -def plot_figs2(res_dirs1, legends, title_base, fig_base, fmt=['b','r','g','m','c','y'], clean_ref=0, colors=None): + fig_file = output_dir + fig_base + "_non_linf" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "n_linf", + fig_file, + clean_ref=clean_ref, + xlabel=r"$L_{\infty}$", + log_x=True, + legends=legends, + fmt=fmt, + title=title_base + " attacks on non-target trials", + font_size=13, + ) + + +def plot_figs2( + res_dirs1, + legends, + title_base, + fig_base, + fmt=["b", "r", "g", "m", "c", "y"], + clean_ref=0, + colors=None, +): df = [] for i 
in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[0]) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[0]) df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_tar_snr' - Eval.plot_dcf_eer_vs_stat_v2(df, 'snr', fig_file, clean_ref=clean_ref, - xlabel='SNR(dB)', higher_better=True, - legends= legends, fmt=fmt, - title=title_base + ' Adv. Evasion', - font_size=13, colors=colors) + fig_file = output_dir + fig_base + "_tar_snr" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "snr", + fig_file, + clean_ref=clean_ref, + xlabel="SNR(dB)", + higher_better=True, + legends=legends, + fmt=fmt, + title=title_base + " Adv. Evasion", + font_size=13, + colors=colors, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[1]) - df_i = pd.read_csv(file_path,index_col=0) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[1]) + df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_non_snr' - Eval.plot_dcf_eer_vs_stat_v2(df, 'snr', fig_file, clean_ref=clean_ref, - xlabel='SNR(dB)', higher_better=True, - legends=legends, fmt=fmt, - title=title_base + ' Adv. Impersonation', - font_size=13, colors=colors) - + fig_file = output_dir + fig_base + "_non_snr" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "snr", + fig_file, + clean_ref=clean_ref, + xlabel="SNR(dB)", + higher_better=True, + legends=legends, + fmt=fmt, + title=title_base + " Adv. Impersonation", + font_size=13, + colors=colors, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[2]) - df_i = pd.read_csv(file_path,index_col=0) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[2]) + df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_tar_linf' - Eval.plot_dcf_eer_vs_stat_v2(df, 'n_linf', fig_file, clean_ref=clean_ref, - xlabel=r'$L_{\infty}$', log_x=True, - legends=legends, fmt=fmt, - title=title_base + ' Adv. Evasion', - font_size=13) + fig_file = output_dir + fig_base + "_tar_linf" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "n_linf", + fig_file, + clean_ref=clean_ref, + xlabel=r"$L_{\infty}$", + log_x=True, + legends=legends, + fmt=fmt, + title=title_base + " Adv. Evasion", + font_size=13, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[3]) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[3]) df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_non_linf' - Eval.plot_dcf_eer_vs_stat_v2(df, 'n_linf', fig_file, clean_ref=clean_ref, - xlabel=r'$L_{\infty}$', log_x=True, - legends=legends, fmt=fmt, - title=title_base + ' Adv. Impersonation', - font_size=13) - + fig_file = output_dir + fig_base + "_non_linf" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "n_linf", + fig_file, + clean_ref=clean_ref, + xlabel=r"$L_{\infty}$", + log_x=True, + legends=legends, + fmt=fmt, + title=title_base + " Adv. 
Impersonation", + font_size=13, + ) if __name__ == "__main__": - if not os.path.isdir(output_dir): os.makedirs(output_dir) - res_dirs0= 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2' - res_dirs1 = ['cosine_fgsm_eall', 'cosine_randfgsm_eall', 'cosine_iterfgsm_eall'] - res_dirs1 = [res_dirs0 + '/' + s for s in res_dirs1] - legends = ['FGSM', 'Rand-FGSM', 'Iter-FGSM'] - plot_figs1(res_dirs1, legends, 'FGSM', 'fgsm') - plot_figs2(res_dirs1, legends, 'FGSM', 'fgsm2') - plot_figs2(res_dirs1, None, 'FGSM', 'fgsmnoleg2') - - res_dirs1 = ['cosine_cwl2_conf0', 'cosine_cwl2_conf1', 'cosine_cwlinf_conf0', 'cosine_cwlinf_conf1'] - res_dirs1 = [res_dirs0 + '/' + s for s in res_dirs1] - legends = ['CW-L2 conf=0', 'CW-L2 conf=1', 'CW-Linf conf=0', 'CW-Linf conf=1'] - plot_figs1(res_dirs1, legends, 'Carlini-Wagner', 'cw') - - + res_dirs0 = "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2" + res_dirs1 = ["cosine_fgsm_eall", "cosine_randfgsm_eall", "cosine_iterfgsm_eall"] + res_dirs1 = [res_dirs0 + "/" + s for s in res_dirs1] + legends = ["FGSM", "Rand-FGSM", "Iter-FGSM"] + plot_figs1(res_dirs1, legends, "FGSM", "fgsm") + plot_figs2(res_dirs1, legends, "FGSM", "fgsm2") + plot_figs2(res_dirs1, None, "FGSM", "fgsmnoleg2") + + res_dirs1 = [ + "cosine_cwl2_conf0", + "cosine_cwl2_conf1", + "cosine_cwlinf_conf0", + "cosine_cwlinf_conf1", + ] + res_dirs1 = [res_dirs0 + "/" + s for s in res_dirs1] + legends = ["CW-L2 conf=0", "CW-L2 conf=1", "CW-Linf conf=0", "CW-Linf conf=1"] + plot_figs1(res_dirs1, legends, "Carlini-Wagner", "cw") ########################### - - res_dirs2 = ['resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2', - 'lresnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2', - 'resetdnn_nl5ld512_e256_arcs30m0.3_do0.1_adam_lr0.05_b512_amp.v2'] - legends = ['ResNet34', 'ThinResNet34', 'ResETDNN'] - res_dirs3 = [s + '/cosine_iterfgsm_eall' for s in res_dirs2] - plot_figs1(res_dirs3, legends, 'Iter-FGSM', 'iterfgsm', clean_ref=None) - plot_figs2(res_dirs3, legends, 'Iter-FGSM', 'iterfgsm2', clean_ref=None) - plot_figs2(res_dirs3, None, 'Iter-FGSM', 'iterfgsmnoleg2', clean_ref=None) - - res_dirs3 = [s + '/cosine_cwl2_conf0' for s in res_dirs2] - plot_figs1(res_dirs3, legends, 'Carlini-Wagner L2', 'cwl2', clean_ref=None) - plot_figs2(res_dirs3, legends, 'Carlini-Wagner L2', 'cwl22', clean_ref=None) + res_dirs2 = [ + "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2", + "lresnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2", + "resetdnn_nl5ld512_e256_arcs30m0.3_do0.1_adam_lr0.05_b512_amp.v2", + ] + legends = ["ResNet34", "ThinResNet34", "ResETDNN"] + res_dirs3 = [s + "/cosine_iterfgsm_eall" for s in res_dirs2] + plot_figs1(res_dirs3, legends, "Iter-FGSM", "iterfgsm", clean_ref=None) + plot_figs2(res_dirs3, legends, "Iter-FGSM", "iterfgsm2", clean_ref=None) + plot_figs2(res_dirs3, None, "Iter-FGSM", "iterfgsmnoleg2", clean_ref=None) + + res_dirs3 = [s + "/cosine_cwl2_conf0" for s in res_dirs2] + plot_figs1(res_dirs3, legends, "Carlini-Wagner L2", "cwl2", clean_ref=None) + plot_figs2(res_dirs3, legends, "Carlini-Wagner L2", "cwl22", clean_ref=None) ########################### - - res_dirs1 = ['cosine_cwl2_conf0', 'cosine_cwl2_conf0_noabort', 'cosine_cwl2_conf0_lr0.001', 'cosine_cwl2_conf0_lr0.001_noabort', - 'cosine_cwl2_conf0_lr0.001_noabort_it20', 'cosine_cwl2_conf0_lr0.001_noabort_it40', 'cosine_cwl2_conf0_lr0.001_noabort_it80', - 'cosine_cwl2_conf0_lr0.001_it80'] - legends = ['default', 'lr=0.01 it10', 'lr=0.001 it10 abort early', 'lr=0.001 it10', 'lr=0.001 it20', 'lr=0.001 it40', 'lr=0.001 it80', - 'lr=0.001 
it80 abort early'] - fmt=['b', 'r', 'g', 'm','c','y','*b','*r', '*g', '*m', '*c', '*y'] - - res_dirs2 = [res_dirs0 + '/' + s for s in res_dirs1] - - plot_figs1(res_dirs2, legends, 'Carlini-Wagner L2', 'cwl2_iters1', fmt=fmt) - - res_dirs1 = ['cosine_cwl2_conf0', 'cosine_cwl2_conf0_lr0.001_noabort', - 'cosine_cwl2_conf0_lr0.001_noabort_it20', 'cosine_cwl2_conf0_lr0.001_noabort_it40', 'cosine_cwl2_conf0_lr0.001_noabort_it80', - 'cosine_cwl2_conf0_lr0.001_it80'] - legends = ['default', 'lr=0.001 it10', 'lr=0.001 it20', 'lr=0.001 it40', 'lr=0.001 it80', - 'lr=0.001 it80 abort early'] - fmt=['b', 'r','g','m','c','y'] - - res_dirs2 = [res_dirs0 + '/' + s for s in res_dirs1] - - plot_figs1(res_dirs2, legends, 'Carlini-Wagner L2', 'cwl2_iters2', fmt=fmt) + res_dirs1 = [ + "cosine_cwl2_conf0", + "cosine_cwl2_conf0_noabort", + "cosine_cwl2_conf0_lr0.001", + "cosine_cwl2_conf0_lr0.001_noabort", + "cosine_cwl2_conf0_lr0.001_noabort_it20", + "cosine_cwl2_conf0_lr0.001_noabort_it40", + "cosine_cwl2_conf0_lr0.001_noabort_it80", + "cosine_cwl2_conf0_lr0.001_it80", + ] + legends = [ + "default", + "lr=0.01 it10", + "lr=0.001 it10 abort early", + "lr=0.001 it10", + "lr=0.001 it20", + "lr=0.001 it40", + "lr=0.001 it80", + "lr=0.001 it80 abort early", + ] + fmt = ["b", "r", "g", "m", "c", "y", "*b", "*r", "*g", "*m", "*c", "*y"] + + res_dirs2 = [res_dirs0 + "/" + s for s in res_dirs1] + + plot_figs1(res_dirs2, legends, "Carlini-Wagner L2", "cwl2_iters1", fmt=fmt) + + res_dirs1 = [ + "cosine_cwl2_conf0", + "cosine_cwl2_conf0_lr0.001_noabort", + "cosine_cwl2_conf0_lr0.001_noabort_it20", + "cosine_cwl2_conf0_lr0.001_noabort_it40", + "cosine_cwl2_conf0_lr0.001_noabort_it80", + "cosine_cwl2_conf0_lr0.001_it80", + ] + legends = [ + "default", + "lr=0.001 it10", + "lr=0.001 it20", + "lr=0.001 it40", + "lr=0.001 it80", + "lr=0.001 it80 abort early", + ] + fmt = ["b", "r", "g", "m", "c", "y"] + + res_dirs2 = [res_dirs0 + "/" + s for s in res_dirs1] + + plot_figs1(res_dirs2, legends, "Carlini-Wagner L2", "cwl2_iters2", fmt=fmt) ########################### - - res_dirs0= 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2' - res_dirs1 = ['cosine_fgsm_eall', 'cosine_randfgsm_eall', 'cosine_iterfgsm_eall', - 'cosine_cwl2_conf0_lr0.001_noabort', 'cosine_cwsnr_conf0_lr0.001_noabort_it10', - 'cosine_cwrms_conf0_lr0.001_noabort_it10', 'cosine_cwrms_conf4_lr0.001_noabort_it10', 'cosine_cwl2_conf0_lr0.001_noabort_it40'] - res_dirs1 = [res_dirs0 + '/' + s for s in res_dirs1] - fmt=['ob', 'vr','^g','>y','sm','pc', 'Pc', '*r', '+g', 'Dc', 'Hm'] - fmt=['ob', 'vr','^g','>y','sm','pc', 'P', '*', '+g', 'Dc', 'Hm'] - colors=['b', 'r','g','y','m','c', 'lime', 'orange', '+g', 'Dc', 'Hm'] - legends = ['FGSM', 'Rand-FGSM', 'Iter-FGSM', 'CW-L2 k=0', 'CW-SNR k=0', - 'CW-RMS k=0', 'CW-RMS k=4', 'CW-RMS k=0 it=40'] - legends = ['FGSM', 'Rand-FGSM', 'Iter-FGSM', 'CW-L2', 'CW-SNR', - 'CW-RMS', 'CW-RMS k=4', 'CW-RMS it=40'] - - plot_figs1(res_dirs1, legends, '', 'fgsmcw', fmt=fmt) - plot_figs1(res_dirs1, None, '', 'fgsmcwnoleg', fmt=fmt) - plot_figs2(res_dirs1, legends, '', 'fgsmcw2', fmt=fmt, colors=colors) - plot_figs2(res_dirs1, None, '', 'fgsmcwnoleg2', fmt=fmt, colors=colors) - - - res_dirs0= 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2' - res_dirs1 = ['cosine_iterfgsm_eall', - 'cosine_cwl2_conf0_lr0.001_noabort', 'cosine_cwsnr_conf0_lr0.001_noabort_it10', - 'cosine_cwrms_conf0_lr0.001_noabort_it10', 'cosine_cwrms_conf4_lr0.001_noabort_it10', 'cosine_cwl2_conf0_lr0.001_noabort_it40'] - res_dirs1 = [res_dirs0 + '/' + s for s in res_dirs1] 
- fmt=['ob', 'vr','^g','>y','sm','pc', 'Pc', '*r', '+g', 'Dc', 'Hm'] - fmt=['ob', 'vr','^g','>y','sm','pc', 'P', '*', '+g', 'Dc', 'Hm'] - colors=['b', 'r','g','y','m','c', 'lime', 'orange', '+g', 'Dc', 'Hm'] - legends = ['Iter-FGSM', 'CW-L2', 'CW-SNR', - 'CW-RMS', 'CW-RMS k=4', 'CW-RMS it=40'] - - plot_figs2(res_dirs1, legends, '', 'fgsmcw3', fmt=fmt, colors=colors) - plot_figs2(res_dirs1, None, '', 'fgsmcwnoleg3', fmt=fmt, colors=colors) - + res_dirs0 = "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2" + res_dirs1 = [ + "cosine_fgsm_eall", + "cosine_randfgsm_eall", + "cosine_iterfgsm_eall", + "cosine_cwl2_conf0_lr0.001_noabort", + "cosine_cwsnr_conf0_lr0.001_noabort_it10", + "cosine_cwrms_conf0_lr0.001_noabort_it10", + "cosine_cwrms_conf4_lr0.001_noabort_it10", + "cosine_cwl2_conf0_lr0.001_noabort_it40", + ] + res_dirs1 = [res_dirs0 + "/" + s for s in res_dirs1] + fmt = ["ob", "vr", "^g", ">y", "sm", "pc", "Pc", "*r", "+g", "Dc", "Hm"] + fmt = ["ob", "vr", "^g", ">y", "sm", "pc", "P", "*", "+g", "Dc", "Hm"] + colors = ["b", "r", "g", "y", "m", "c", "lime", "orange", "+g", "Dc", "Hm"] + legends = [ + "FGSM", + "Rand-FGSM", + "Iter-FGSM", + "CW-L2 k=0", + "CW-SNR k=0", + "CW-RMS k=0", + "CW-RMS k=4", + "CW-RMS k=0 it=40", + ] + legends = [ + "FGSM", + "Rand-FGSM", + "Iter-FGSM", + "CW-L2", + "CW-SNR", + "CW-RMS", + "CW-RMS k=4", + "CW-RMS it=40", + ] + + plot_figs1(res_dirs1, legends, "", "fgsmcw", fmt=fmt) + plot_figs1(res_dirs1, None, "", "fgsmcwnoleg", fmt=fmt) + plot_figs2(res_dirs1, legends, "", "fgsmcw2", fmt=fmt, colors=colors) + plot_figs2(res_dirs1, None, "", "fgsmcwnoleg2", fmt=fmt, colors=colors) + + res_dirs0 = "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2" + res_dirs1 = [ + "cosine_iterfgsm_eall", + "cosine_cwl2_conf0_lr0.001_noabort", + "cosine_cwsnr_conf0_lr0.001_noabort_it10", + "cosine_cwrms_conf0_lr0.001_noabort_it10", + "cosine_cwrms_conf4_lr0.001_noabort_it10", + "cosine_cwl2_conf0_lr0.001_noabort_it40", + ] + res_dirs1 = [res_dirs0 + "/" + s for s in res_dirs1] + fmt = ["ob", "vr", "^g", ">y", "sm", "pc", "Pc", "*r", "+g", "Dc", "Hm"] + fmt = ["ob", "vr", "^g", ">y", "sm", "pc", "P", "*", "+g", "Dc", "Hm"] + colors = ["b", "r", "g", "y", "m", "c", "lime", "orange", "+g", "Dc", "Hm"] + legends = ["Iter-FGSM", "CW-L2", "CW-SNR", "CW-RMS", "CW-RMS k=4", "CW-RMS it=40"] + + plot_figs2(res_dirs1, legends, "", "fgsmcw3", fmt=fmt, colors=colors) + plot_figs2(res_dirs1, None, "", "fgsmcwnoleg3", fmt=fmt, colors=colors) ########################### - res_dirs1 = ['cosine_iterfgsm_eall', 'cosine_iterfgsm_eall_randsmooth0.001', 'cosine_iterfgsm_eall_randsmooth0.01'] - legends = ['no-def', '$\sigma=32$', '$\sigma=327$'] - fmt=['b', 'r','g','m','c','y'] - - res_dirs2 = [res_dirs0 + '/' + s for s in res_dirs1] - - plot_figs1(res_dirs2, legends, 'IterFGSM RandSmooth', 'iterfgsm_randsmooth', fmt=fmt) - plot_figs2(res_dirs2, legends, 'IterFGSM RandSmooth', 'iterfgsm_randsmooth2', fmt=fmt) - plot_figs2(res_dirs2, None, 'IterFGSM RandSmooth', 'iterfgsm_randsmoothnoleg2', fmt=fmt) + res_dirs1 = [ + "cosine_iterfgsm_eall", + "cosine_iterfgsm_eall_randsmooth0.001", + "cosine_iterfgsm_eall_randsmooth0.01", + ] + legends = ["no-def", "$\sigma=32$", "$\sigma=327$"] + fmt = ["b", "r", "g", "m", "c", "y"] + + res_dirs2 = [res_dirs0 + "/" + s for s in res_dirs1] + + plot_figs1( + res_dirs2, legends, "IterFGSM RandSmooth", "iterfgsm_randsmooth", fmt=fmt + ) + plot_figs2( + res_dirs2, legends, "IterFGSM RandSmooth", "iterfgsm_randsmooth2", fmt=fmt + ) + plot_figs2( + res_dirs2, None, "IterFGSM 
RandSmooth", "iterfgsm_randsmoothnoleg2", fmt=fmt + ) ########################### - res_dirs2 = ['resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2', - 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2.advft_400_400_sgdcos_lr0.005_b256_attack_p0.5eps1step0.125_amp.v1_ep5', - 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2.advft_400_400_sgdcos_lr0.005_b256_attack_p0.5eps1step0.125_amp.v1'] - legends = ['No-adv', 'Adv. epoch=5', 'Adv. epoch=23'] - res_dirs3 = [s + '/cosine_fgsm_eall' for s in res_dirs2] - plot_figs1(res_dirs3, legends, 'FGSM adv. finetuning', 'fgsm_advft', clean_ref=None) + res_dirs2 = [ + "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2", + "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2.advft_400_400_sgdcos_lr0.005_b256_attack_p0.5eps1step0.125_amp.v1_ep5", + "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2.advft_400_400_sgdcos_lr0.005_b256_attack_p0.5eps1step0.125_amp.v1", + ] + legends = ["No-adv", "Adv. epoch=5", "Adv. epoch=23"] + res_dirs3 = [s + "/cosine_fgsm_eall" for s in res_dirs2] + plot_figs1(res_dirs3, legends, "FGSM adv. finetuning", "fgsm_advft", clean_ref=None) ########################### - res_dirs1 = ['cosine_fgsm_eall', 'cosine_randfgsm_eall', 'cosine_iterfgsm_eall', - 'cosine_cwl2_conf0', 'cosine_cwl2_conf1', 'cosine_cwlinf_conf0', 'cosine_cwlinf_conf1'] - names = ['FGSM', 'Rand-FGSM', 'Iter-FGSM', - 'CW-L2 conf=0', 'CW-L2 conf=1', 'CW-Linf conf=0', 'CW-Linf conf=1'] - fig_names = ['fgsm', 'randfgsm', 'iterfgsm', 'cwl2_conf0', 'cwl2_conf1', 'cwlinf_conf0', 'cwlinf_conf1'] - legends = ['ResNet34 (white-box)', 'ThinResNet34', 'ResETDNN'] - fmt=['b','r','g','m','c','y'] + res_dirs1 = [ + "cosine_fgsm_eall", + "cosine_randfgsm_eall", + "cosine_iterfgsm_eall", + "cosine_cwl2_conf0", + "cosine_cwl2_conf1", + "cosine_cwlinf_conf0", + "cosine_cwlinf_conf1", + ] + names = [ + "FGSM", + "Rand-FGSM", + "Iter-FGSM", + "CW-L2 conf=0", + "CW-L2 conf=1", + "CW-Linf conf=0", + "CW-Linf conf=1", + ] + fig_names = [ + "fgsm", + "randfgsm", + "iterfgsm", + "cwl2_conf0", + "cwl2_conf1", + "cwlinf_conf0", + "cwlinf_conf1", + ] + legends = ["ResNet34 (white-box)", "ThinResNet34", "ResETDNN"] + fmt = ["b", "r", "g", "m", "c", "y"] for i in range(len(names)): - res_dirs2 = [res_dirs1[i], 'transfer.lresnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2/' + res_dirs1[i], - 'transfer.resetdnn_nl5ld512_e256_arcs30m0.3_do0.1_adam_lr0.05_b512_amp.v2/' + res_dirs1[i]] - res_dirs2 = [res_dirs0 + '/' + s for s in res_dirs2] - plot_figs1(res_dirs2, legends, names[i] + ' black-box', fig_names[i] + '_bbox', fmt=fmt) - plot_figs2(res_dirs2, legends, names[i] + ' black-box', fig_names[i] + '_bbox2', fmt=fmt) - - + res_dirs2 = [ + res_dirs1[i], + "transfer.lresnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2/" + + res_dirs1[i], + "transfer.resetdnn_nl5ld512_e256_arcs30m0.3_do0.1_adam_lr0.05_b512_amp.v2/" + + res_dirs1[i], + ] + res_dirs2 = [res_dirs0 + "/" + s for s in res_dirs2] + plot_figs1( + res_dirs2, legends, names[i] + " black-box", fig_names[i] + "_bbox", fmt=fmt + ) + plot_figs2( + res_dirs2, + legends, + names[i] + " black-box", + fig_names[i] + "_bbox2", + fmt=fmt, + ) diff --git a/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_snr_v1.py b/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_snr_v1.py index dc448385..9e678302 100755 --- a/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_snr_v1.py +++ b/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_snr_v1.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import 
ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -17,21 +22,28 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import Utt2Info, SCPList -snr_levels = np.arange(0,65,10) +snr_levels = np.arange(0, 65, 10) + def quant_snr(snr): - q = np.argmin((snr_levels - snr)**2) - q_str = 'snr-%d' % (int(snr_levels[q])) + q = np.argmin((snr_levels - snr) ** 2) + q_str = "snr-%d" % (int(snr_levels[q])) return q_str -def make_lists(input_file, benign_wav_file, output_dir, - test_min_snr, test_max_snr, test_success_category): +def make_lists( + input_file, + benign_wav_file, + output_dir, + test_min_snr, + test_max_snr, + test_success_category, +): output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_file, 'r') as f: + with open(input_file, "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) @@ -41,50 +53,57 @@ def make_lists(input_file, benign_wav_file, output_dir, classes = [] benign_keys = [] durs = [] - for k,v in test_attacks.items(): - s = v['success'] - if not (test_success_category == 'both' or - test_success_category == 'success' and s or - test_success_category == 'fail' and not s): + for k, v in test_attacks.items(): + s = v["success"] + if not ( + test_success_category == "both" + or test_success_category == "success" + and s + or test_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < test_min_snr or snr > test_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(quant_snr(v['snr'])) - benign_keys.append(v['test_benign']) + files.append(v["wav_path"]) + classes.append(quant_snr(v["snr"])) + benign_keys.append(v["test_benign"]) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) test_u2c = Utt2Info.create(keys, classes) test_wav = SCPList(keys, files) - test_u2c.save(output_dir / 'utt2attack') - test_wav.save(output_dir / 'wav.scp') - + test_u2c.save(output_dir / "utt2attack") + test_wav.save(output_dir / "wav.scp") + if __name__ == "__main__": - parser = ArgumentParser( - description='prepare lists to test attack classification') - - parser.add_argument('--input-file', required=True) - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('--test-min-snr', default=-10, type=float) - parser.add_argument('--test-max-snr', default=100, type=float) - parser.add_argument('--test-success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = ArgumentParser(description="prepare lists to test attack classification") + + parser.add_argument("--input-file", required=True) + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--test-min-snr", default=-10, type=float) + parser.add_argument("--test-max-snr", default=100, type=float) + parser.add_argument( + "--test-success-category", + default="success", + choices=["success", "fail", "both"], + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = 
parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_threat_model_v1.py b/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_threat_model_v1.py index e0324168..4d19af59 100755 --- a/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_threat_model_v1.py +++ b/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_threat_model_v1.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -18,13 +23,19 @@ from hyperion.utils import Utt2Info, SCPList -def make_lists(input_file, benign_wav_file, output_dir, - test_min_snr, test_max_snr, test_success_category): +def make_lists( + input_file, + benign_wav_file, + output_dir, + test_min_snr, + test_max_snr, + test_success_category, +): output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_file, 'r') as f: + with open(input_file, "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) @@ -34,41 +45,44 @@ def make_lists(input_file, benign_wav_file, output_dir, classes = [] benign_keys = [] durs = [] - for k,v in test_attacks.items(): + for k, v in test_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['threat_model']) - benign_keys.append(v['test_benign']) + files.append(v["wav_path"]) + classes.append(v["threat_model"]) + benign_keys.append(v["test_benign"]) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) test_u2c = Utt2Info.create(keys, classes) test_wav = SCPList(keys, files) - test_u2c.save(output_dir / 'utt2attack') - test_wav.save(output_dir / 'wav.scp') - + test_u2c.save(output_dir / "utt2attack") + test_wav.save(output_dir / "wav.scp") + if __name__ == "__main__": - parser = ArgumentParser( - description='prepare lists to test attack classification') - - parser.add_argument('--input-file', required=True) - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('--test-min-snr', default=-10, type=float) - parser.add_argument('--test-max-snr', default=100, type=float) - parser.add_argument('--test-success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = ArgumentParser(description="prepare lists to test attack classification") + + parser.add_argument("--input-file", required=True) + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--test-min-snr", default=-10, type=float) + parser.add_argument("--test-max-snr", default=100, type=float) + parser.add_argument( + "--test-success-category", + default="success", + choices=["success", "fail", "both"], + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_type_v1.py 
b/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_type_v1.py index e80a8b53..cd70bdaa 100755 --- a/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_type_v1.py +++ b/egs/voxceleb/adv.v2/local/make_spkverif_test_lists_exp_attack_type_v1.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -18,13 +23,19 @@ from hyperion.utils import Utt2Info, SCPList -def make_lists(input_file, benign_wav_file, output_dir, - test_min_snr, test_max_snr, test_success_category): +def make_lists( + input_file, + benign_wav_file, + output_dir, + test_min_snr, + test_max_snr, + test_success_category, +): output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_file, 'r') as f: + with open(input_file, "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) @@ -34,50 +45,57 @@ def make_lists(input_file, benign_wav_file, output_dir, classes = [] benign_keys = [] durs = [] - for k,v in test_attacks.items(): - s = v['success'] - if not (test_success_category == 'both' or - test_success_category == 'success' and s or - test_success_category == 'fail' and not s): + for k, v in test_attacks.items(): + s = v["success"] + if not ( + test_success_category == "both" + or test_success_category == "success" + and s + or test_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < test_min_snr or snr > test_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - benign_keys.append(v['test_benign']) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + benign_keys.append(v["test_benign"]) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) test_u2c = Utt2Info.create(keys, classes) test_wav = SCPList(keys, files) - test_u2c.save(output_dir / 'utt2attack') - test_wav.save(output_dir / 'wav.scp') - + test_u2c.save(output_dir / "utt2attack") + test_wav.save(output_dir / "wav.scp") + if __name__ == "__main__": - parser = ArgumentParser( - description='prepare lists to test attack classification') - - parser.add_argument('--input-file', required=True) - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('--test-min-snr', default=-10, type=float) - parser.add_argument('--test-max-snr', default=100, type=float) - parser.add_argument('--test-success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = ArgumentParser(description="prepare lists to test attack classification") + + parser.add_argument("--input-file", required=True) + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--test-min-snr", default=-10, type=float) + parser.add_argument("--test-max-snr", default=100, type=float) + parser.add_argument( + "--test-success-category", + default="success", + choices=["success", "fail", "both"], + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = 
parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_snr_v0.py b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_snr_v0.py index dd117c07..7cde451f 100755 --- a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_snr_v0.py +++ b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_snr_v0.py @@ -17,11 +17,12 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import Utt2Info, SCPList -snr_levels = np.arange(0,65,10) +snr_levels = np.arange(0, 65, 10) + def quant_snr(snr): - q = np.argmin((snr_levels - snr)**2) - q_str = 'snr-%d' % (int(snr_levels[q])) + q = np.argmin((snr_levels - snr) ** 2) + q_str = "snr-%d" % (int(snr_levels[q])) return q_str @@ -32,34 +33,34 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'train_attack_info.yml', 'r') as f: + with open(input_dir / "train_attack_info.yml", "r") as f: train_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'val_attack_info.yml', 'r') as f: + with open(input_dir / "val_attack_info.yml", "r") as f: val_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'test_attack_info.yml', 'r') as f: + with open(input_dir / "test_attack_info.yml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) u2d = Utt2Info.load(benign_durs) - + keys = [] files = [] classes = [] benign_keys = [] durs = [] - for k,v in train_attacks.items(): + for k, v in train_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(quant_snr(v['snr'])) - benign_keys.append(v['benign_key']) - durs.append(v['num_frames']/16000) + files.append(v["wav_path"]) + classes.append(quant_snr(v["snr"])) + benign_keys.append(v["benign_key"]) + durs.append(v["num_frames"] / 16000) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) durs.append(u2d[k]) @@ -75,17 +76,17 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): classes = [] benign_keys = [] durs = [] - for k,v in val_attacks.items(): + for k, v in val_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(quant_snr(v['snr'])) - benign_keys.append(v['benign_key']) - durs.append(v['num_frames']/16000) + files.append(v["wav_path"]) + classes.append(quant_snr(v["snr"])) + benign_keys.append(v["benign_key"]) + durs.append(v["num_frames"] / 16000) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) durs.append(u2d[k]) @@ -100,17 +101,17 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): classes = [] benign_keys = [] durs = [] - for k,v in test_attacks.items(): + for k, v in test_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(quant_snr(v['snr'])) - benign_keys.append(v['benign_key']) - durs.append(v['num_frames']/16000) + files.append(v["wav_path"]) + classes.append(quant_snr(v["snr"])) + benign_keys.append(v["benign_key"]) + durs.append(v["num_frames"] / 16000) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) durs.append(u2d[k]) @@ -123,40 +124,42 @@ def 
make_lists(input_dir, benign_wav_file, benign_durs, output_dir): trainval_u2d = Utt2Info.merge([train_u2d, val_u2d]) ##### - train_u2c.save(output_dir / 'train_utt2attack') - val_u2c.save(output_dir / 'val_utt2attack') - test_u2c.save(output_dir / 'test_utt2attack') + train_u2c.save(output_dir / "train_utt2attack") + val_u2c.save(output_dir / "val_utt2attack") + test_u2c.save(output_dir / "test_utt2attack") - train_wav.save(output_dir / 'train_wav.scp') - val_wav.save(output_dir / 'val_wav.scp') - trainval_wav.save(output_dir / 'trainval_wav.scp') - test_wav.save(output_dir / 'test_wav.scp') + train_wav.save(output_dir / "train_wav.scp") + val_wav.save(output_dir / "val_wav.scp") + trainval_wav.save(output_dir / "trainval_wav.scp") + test_wav.save(output_dir / "test_wav.scp") - train_u2d.save(output_dir / 'train_utt2dur') - val_u2d.save(output_dir / 'val_utt2dur') - trainval_u2d.save(output_dir / 'trainval_utt2dur') - test_u2d.save(output_dir / 'test_utt2dur') + train_u2d.save(output_dir / "train_utt2dur") + val_u2d.save(output_dir / "val_utt2dur") + trainval_u2d.save(output_dir / "trainval_utt2dur") + test_u2d.save(output_dir / "test_utt2dur") - with open(output_dir / 'class2int', 'w') as f: + with open(output_dir / "class2int", "w") as f: for c in uclasses: - f.write('%s\n' % (c)) - + f.write("%s\n" % (c)) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='prepare lists to train nnet to discriminate between attacks snr and benign speech') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--benign-durs', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="prepare lists to train nnet to discriminate between attacks snr and benign speech", + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--benign-durs", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_snr_v1.py b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_snr_v1.py index 7259da8f..bc1b354f 100755 --- a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_snr_v1.py +++ b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_snr_v1.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -17,54 +22,66 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import Utt2Info, SCPList -snr_levels = np.arange(0,65,10) +snr_levels = np.arange(0, 65, 10) + def quant_snr(snr): - q = np.argmin((snr_levels - snr)**2) - q_str = 'snr-%d' % (int(snr_levels[q])) + q = np.argmin((snr_levels - snr) ** 2) + q_str = "snr-%d" % (int(snr_levels[q])) return q_str -def make_lists(input_dir, output_dir, - 
train_min_snr, train_max_snr, train_success_category, - test_min_snr, test_max_snr, test_success_category): +def make_lists( + input_dir, + output_dir, + train_min_snr, + train_max_snr, + train_success_category, + test_min_snr, + test_max_snr, + test_success_category, +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'train_attack_info.yaml', 'r') as f: + with open(input_dir / "train_attack_info.yaml", "r") as f: train_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'val_attack_info.yaml', 'r') as f: + with open(input_dir / "val_attack_info.yaml", "r") as f: val_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'test_attack_info.yaml', 'r') as f: + with open(input_dir / "test_attack_info.yaml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) keys = [] files = [] classes = [] durs = [] - for k,v in train_attacks.items(): - s = v['success'] - if not (train_success_category == 'both' or - train_success_category == 'success' and s or - train_success_category == 'fail' and not s): + for k, v in train_attacks.items(): + s = v["success"] + if not ( + train_success_category == "both" + or train_success_category == "success" + and s + or train_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < train_min_snr or snr > train_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(quant_snr(v['snr'])) - durs.append(v['num_samples']/16000) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(quant_snr(v["snr"])) + durs.append(v["num_samples"] / 16000) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + durs.append(v["num_samples"] / 16000) train_u2c = Utt2Info.create(keys, classes) train_u2d = Utt2Info.create(keys, durs) @@ -77,24 +94,28 @@ def make_lists(input_dir, output_dir, files = [] classes = [] durs = [] - for k,v in val_attacks.items(): - s = v['success'] - if not (train_success_category == 'both' or - train_success_category == 'success' and s or - train_success_category == 'fail' and not s): + for k, v in val_attacks.items(): + s = v["success"] + if not ( + train_success_category == "both" + or train_success_category == "success" + and s + or train_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < train_min_snr or snr > train_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(quant_snr(v['snr'])) - durs.append(v['num_samples']/16000) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(quant_snr(v["snr"])) + durs.append(v["num_samples"] / 16000) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + durs.append(v["num_samples"] / 16000) val_u2c = Utt2Info.create(keys, classes) val_u2d = Utt2Info.create(keys, durs) @@ -106,24 +127,28 @@ def make_lists(input_dir, output_dir, files = [] classes = [] durs = [] - for k,v in test_attacks.items(): - s = v['success'] - if not (test_success_category == 'both' or - test_success_category == 'success' and s or - test_success_category == 'fail' and not s): + for k, v in test_attacks.items(): + s = v["success"] + if not ( + 
test_success_category == "both" + or test_success_category == "success" + and s + or test_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < test_min_snr or snr > test_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(quant_snr(v['snr'])) - durs.append(v['num_samples']/16000) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(quant_snr(v["snr"])) + durs.append(v["num_samples"] / 16000) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + durs.append(v["num_samples"] / 16000) test_u2c = Utt2Info.create(keys, classes) test_u2d = Utt2Info.create(keys, durs) @@ -134,45 +159,53 @@ def make_lists(input_dir, output_dir, trainval_u2d = Utt2Info.merge([train_u2d, val_u2d]) ##### - train_u2c.save(output_dir / 'train_utt2attack') - val_u2c.save(output_dir / 'val_utt2attack') - test_u2c.save(output_dir / 'test_utt2attack') + train_u2c.save(output_dir / "train_utt2attack") + val_u2c.save(output_dir / "val_utt2attack") + test_u2c.save(output_dir / "test_utt2attack") - train_wav.save(output_dir / 'train_wav.scp') - val_wav.save(output_dir / 'val_wav.scp') - trainval_wav.save(output_dir / 'trainval_wav.scp') - test_wav.save(output_dir / 'test_wav.scp') + train_wav.save(output_dir / "train_wav.scp") + val_wav.save(output_dir / "val_wav.scp") + trainval_wav.save(output_dir / "trainval_wav.scp") + test_wav.save(output_dir / "test_wav.scp") - train_u2d.save(output_dir / 'train_utt2dur') - val_u2d.save(output_dir / 'val_utt2dur') - trainval_u2d.save(output_dir / 'trainval_utt2dur') - test_u2d.save(output_dir / 'test_utt2dur') + train_u2d.save(output_dir / "train_utt2dur") + val_u2d.save(output_dir / "val_utt2dur") + trainval_u2d.save(output_dir / "trainval_utt2dur") + test_u2d.save(output_dir / "test_utt2dur") - with open(output_dir / 'class_file', 'w') as f: + with open(output_dir / "class_file", "w") as f: for c in uclasses: - f.write('%s\n' % (c)) - + f.write("%s\n" % (c)) + if __name__ == "__main__": parser = ArgumentParser( - description='prepare lists to train nnet to discriminate between attacks snr and benign speech') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('--train-min-snr', default=-10, type=float) - parser.add_argument('--train-max-snr', default=100, type=float) - parser.add_argument('--train-success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('--test-min-snr', default=-10, type=float) - parser.add_argument('--test-max-snr', default=100, type=float) - parser.add_argument('--test-success-category', default='success', - choices=['success', 'fail', 'both']) - - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + description="prepare lists to train nnet to discriminate between attacks snr and benign speech" + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--train-min-snr", default=-10, type=float) + parser.add_argument("--train-max-snr", default=100, type=float) + parser.add_argument( + "--train-success-category", + default="success", + choices=["success", "fail", "both"], + ) + parser.add_argument("--test-min-snr", default=-10, type=float) + 
parser.add_argument("--test-max-snr", default=100, type=float) + parser.add_argument( + "--test-success-category", + default="success", + choices=["success", "fail", "both"], + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_threat_model_v0.py b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_threat_model_v0.py index 0bccce5d..e467d177 100755 --- a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_threat_model_v0.py +++ b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_threat_model_v0.py @@ -25,34 +25,34 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'train_attack_info.yml', 'r') as f: + with open(input_dir / "train_attack_info.yml", "r") as f: train_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'val_attack_info.yml', 'r') as f: + with open(input_dir / "val_attack_info.yml", "r") as f: val_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'test_attack_info.yml', 'r') as f: + with open(input_dir / "test_attack_info.yml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) u2d = Utt2Info.load(benign_durs) - + keys = [] files = [] classes = [] benign_keys = [] durs = [] - for k,v in train_attacks.items(): + for k, v in train_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['threat_model']) - benign_keys.append(v['benign_key']) - durs.append(v['num_frames']/16000) + files.append(v["wav_path"]) + classes.append(v["threat_model"]) + benign_keys.append(v["benign_key"]) + durs.append(v["num_frames"] / 16000) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) durs.append(u2d[k]) @@ -68,17 +68,17 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): classes = [] benign_keys = [] durs = [] - for k,v in val_attacks.items(): + for k, v in val_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['threat_model']) - benign_keys.append(v['benign_key']) - durs.append(v['num_frames']/16000) + files.append(v["wav_path"]) + classes.append(v["threat_model"]) + benign_keys.append(v["benign_key"]) + durs.append(v["num_frames"] / 16000) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) durs.append(u2d[k]) @@ -93,17 +93,17 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): classes = [] benign_keys = [] durs = [] - for k,v in test_attacks.items(): + for k, v in test_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['threat_model']) - benign_keys.append(v['benign_key']) - durs.append(v['num_frames']/16000) + files.append(v["wav_path"]) + classes.append(v["threat_model"]) + benign_keys.append(v["benign_key"]) + durs.append(v["num_frames"] / 16000) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) durs.append(u2d[k]) @@ -116,40 +116,42 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): trainval_u2d = 
Utt2Info.merge([train_u2d, val_u2d]) ##### - train_u2c.save(output_dir / 'train_utt2attack') - val_u2c.save(output_dir / 'val_utt2attack') - test_u2c.save(output_dir / 'test_utt2attack') + train_u2c.save(output_dir / "train_utt2attack") + val_u2c.save(output_dir / "val_utt2attack") + test_u2c.save(output_dir / "test_utt2attack") - train_wav.save(output_dir / 'train_wav.scp') - val_wav.save(output_dir / 'val_wav.scp') - trainval_wav.save(output_dir / 'trainval_wav.scp') - test_wav.save(output_dir / 'test_wav.scp') + train_wav.save(output_dir / "train_wav.scp") + val_wav.save(output_dir / "val_wav.scp") + trainval_wav.save(output_dir / "trainval_wav.scp") + test_wav.save(output_dir / "test_wav.scp") - train_u2d.save(output_dir / 'train_utt2dur') - val_u2d.save(output_dir / 'val_utt2dur') - trainval_u2d.save(output_dir / 'trainval_utt2dur') - test_u2d.save(output_dir / 'test_utt2dur') + train_u2d.save(output_dir / "train_utt2dur") + val_u2d.save(output_dir / "val_utt2dur") + trainval_u2d.save(output_dir / "trainval_utt2dur") + test_u2d.save(output_dir / "test_utt2dur") - with open(output_dir / 'class2int', 'w') as f: + with open(output_dir / "class2int", "w") as f: for c in uclasses: - f.write('%s\n' % (c)) - + f.write("%s\n" % (c)) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='prepare lists to train nnet to discriminate between attacks types and benign speech') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--benign-durs', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="prepare lists to train nnet to discriminate between attacks types and benign speech", + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--benign-durs", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_threat_model_v1.py b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_threat_model_v1.py index f133333e..731ef93b 100755 --- a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_threat_model_v1.py +++ b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_threat_model_v1.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -18,46 +23,57 @@ from hyperion.utils import Utt2Info, SCPList -def make_lists(input_dir, output_dir, - train_min_snr, train_max_snr, train_success_category, - test_min_snr, test_max_snr, test_success_category): +def make_lists( + input_dir, + output_dir, + train_min_snr, + train_max_snr, + train_success_category, + test_min_snr, + test_max_snr, + test_success_category, +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) 
output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'train_attack_info.yaml', 'r') as f: + with open(input_dir / "train_attack_info.yaml", "r") as f: train_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'val_attack_info.yaml', 'r') as f: + with open(input_dir / "val_attack_info.yaml", "r") as f: val_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'test_attack_info.yaml', 'r') as f: + with open(input_dir / "test_attack_info.yaml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) keys = [] files = [] classes = [] durs = [] - for k,v in train_attacks.items(): - s = v['success'] - if not (train_success_category == 'both' or - train_success_category == 'success' and s or - train_success_category == 'fail' and not s): + for k, v in train_attacks.items(): + s = v["success"] + if not ( + train_success_category == "both" + or train_success_category == "success" + and s + or train_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < train_min_snr or snr > train_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['threat_model']) - durs.append(v['num_samples']/16000) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(v["threat_model"]) + durs.append(v["num_samples"] / 16000) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + durs.append(v["num_samples"] / 16000) train_u2c = Utt2Info.create(keys, classes) train_u2d = Utt2Info.create(keys, durs) @@ -70,24 +86,28 @@ def make_lists(input_dir, output_dir, files = [] classes = [] durs = [] - for k,v in val_attacks.items(): - s = v['success'] - if not (train_success_category == 'both' or - train_success_category == 'success' and s or - train_success_category == 'fail' and not s): + for k, v in val_attacks.items(): + s = v["success"] + if not ( + train_success_category == "both" + or train_success_category == "success" + and s + or train_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < train_min_snr or snr > train_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['threat_model']) - durs.append(v['num_samples']/16000) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(v["threat_model"]) + durs.append(v["num_samples"] / 16000) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + durs.append(v["num_samples"] / 16000) val_u2c = Utt2Info.create(keys, classes) val_u2d = Utt2Info.create(keys, durs) @@ -100,24 +120,28 @@ def make_lists(input_dir, output_dir, classes = [] benign_keys = [] durs = [] - for k,v in test_attacks.items(): - s = v['success'] - if not (test_success_category == 'both' or - test_success_category == 'success' and s or - test_success_category == 'fail' and not s): + for k, v in test_attacks.items(): + s = v["success"] + if not ( + test_success_category == "both" + or test_success_category == "success" + and s + or test_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < test_min_snr or snr > test_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['threat_model']) - durs.append(v['num_samples']/16000) - 
keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(v["threat_model"]) + durs.append(v["num_samples"] / 16000) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + durs.append(v["num_samples"] / 16000) test_u2c = Utt2Info.create(keys, classes) test_u2d = Utt2Info.create(keys, durs) @@ -128,45 +152,53 @@ def make_lists(input_dir, output_dir, trainval_u2d = Utt2Info.merge([train_u2d, val_u2d]) ##### - train_u2c.save(output_dir / 'train_utt2attack') - val_u2c.save(output_dir / 'val_utt2attack') - test_u2c.save(output_dir / 'test_utt2attack') + train_u2c.save(output_dir / "train_utt2attack") + val_u2c.save(output_dir / "val_utt2attack") + test_u2c.save(output_dir / "test_utt2attack") - train_wav.save(output_dir / 'train_wav.scp') - val_wav.save(output_dir / 'val_wav.scp') - trainval_wav.save(output_dir / 'trainval_wav.scp') - test_wav.save(output_dir / 'test_wav.scp') + train_wav.save(output_dir / "train_wav.scp") + val_wav.save(output_dir / "val_wav.scp") + trainval_wav.save(output_dir / "trainval_wav.scp") + test_wav.save(output_dir / "test_wav.scp") - train_u2d.save(output_dir / 'train_utt2dur') - val_u2d.save(output_dir / 'val_utt2dur') - trainval_u2d.save(output_dir / 'trainval_utt2dur') - test_u2d.save(output_dir / 'test_utt2dur') + train_u2d.save(output_dir / "train_utt2dur") + val_u2d.save(output_dir / "val_utt2dur") + trainval_u2d.save(output_dir / "trainval_utt2dur") + test_u2d.save(output_dir / "test_utt2dur") - with open(output_dir / 'class_file', 'w') as f: + with open(output_dir / "class_file", "w") as f: for c in uclasses: - f.write('%s\n' % (c)) - + f.write("%s\n" % (c)) + if __name__ == "__main__": parser = ArgumentParser( - description='prepare lists to train nnet to discriminate between attacks types and benign speech') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('--train-min-snr', default=-10, type=float) - parser.add_argument('--train-max-snr', default=100, type=float) - parser.add_argument('--train-success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('--test-min-snr', default=-10, type=float) - parser.add_argument('--test-max-snr', default=100, type=float) - parser.add_argument('--test-success-category', default='success', - choices=['success', 'fail', 'both']) - - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + description="prepare lists to train nnet to discriminate between attacks types and benign speech" + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--train-min-snr", default=-10, type=float) + parser.add_argument("--train-max-snr", default=100, type=float) + parser.add_argument( + "--train-success-category", + default="success", + choices=["success", "fail", "both"], + ) + parser.add_argument("--test-min-snr", default=-10, type=float) + parser.add_argument("--test-max-snr", default=100, type=float) + parser.add_argument( + "--test-success-category", + default="success", + choices=["success", "fail", "both"], + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) 
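# --- Editor's note (not part of the patch) ----------------------------------
# The success-category filter that Black re-wraps in several of these
# make_*_lists scripts relies on `and` binding tighter than `or`:
#   category == "both"  OR  (category == "success" AND s)  OR  (category == "fail" AND NOT s).
# Below is a minimal sketch of an equivalent, more explicit check; the helper
# name `keep_attack` is hypothetical and used here for illustration only.

def keep_attack(success_category, succeeded, snr, min_snr, max_snr):
    """Return True if an attack entry passes the same filters as make_lists()."""
    if success_category == "both":
        category_ok = True
    elif success_category == "success":
        category_ok = succeeded
    else:  # "fail"
        category_ok = not succeeded
    # keep entries whose SNR lies inside [min_snr, max_snr]
    return category_ok and (min_snr <= snr <= max_snr)

# Usage mirroring the loops over the attack-info YAML entries:
#     if not keep_attack(train_success_category, v["success"], v["snr"],
#                        train_min_snr, train_max_snr):
#         continue
# -----------------------------------------------------------------------------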
diff --git a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_type_v0.py b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_type_v0.py index 3aaae2f9..c2360ab9 100755 --- a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_type_v0.py +++ b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_type_v0.py @@ -25,34 +25,34 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'train_attack_info.yml', 'r') as f: + with open(input_dir / "train_attack_info.yml", "r") as f: train_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'val_attack_info.yml', 'r') as f: + with open(input_dir / "val_attack_info.yml", "r") as f: val_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'test_attack_info.yml', 'r') as f: + with open(input_dir / "test_attack_info.yml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) u2d = Utt2Info.load(benign_durs) - + keys = [] files = [] classes = [] benign_keys = [] durs = [] - for k,v in train_attacks.items(): + for k, v in train_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - benign_keys.append(v['key_benign']) - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + benign_keys.append(v["key_benign"]) + durs.append(v["num_samples"] / 16000) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) durs.append(u2d[k]) @@ -68,17 +68,17 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): classes = [] benign_keys = [] durs = [] - for k,v in val_attacks.items(): + for k, v in val_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - benign_keys.append(v['key_benign']) - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + benign_keys.append(v["key_benign"]) + durs.append(v["num_samples"] / 16000) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) durs.append(u2d[k]) @@ -93,17 +93,17 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): classes = [] benign_keys = [] durs = [] - for k,v in test_attacks.items(): + for k, v in test_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - benign_keys.append(v['key']) - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + benign_keys.append(v["key"]) + durs.append(v["num_samples"] / 16000) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) durs.append(u2d[k]) @@ -116,40 +116,42 @@ def make_lists(input_dir, benign_wav_file, benign_durs, output_dir): trainval_u2d = Utt2Info.merge([train_u2d, val_u2d]) ##### - train_u2c.save(output_dir / 'train_utt2attack') - val_u2c.save(output_dir / 'val_utt2attack') - test_u2c.save(output_dir / 'test_utt2attack') + train_u2c.save(output_dir / "train_utt2attack") + val_u2c.save(output_dir / "val_utt2attack") + test_u2c.save(output_dir / "test_utt2attack") - train_wav.save(output_dir / 'train_wav.scp') - val_wav.save(output_dir / 'val_wav.scp') - 
trainval_wav.save(output_dir / 'trainval_wav.scp') - test_wav.save(output_dir / 'test_wav.scp') + train_wav.save(output_dir / "train_wav.scp") + val_wav.save(output_dir / "val_wav.scp") + trainval_wav.save(output_dir / "trainval_wav.scp") + test_wav.save(output_dir / "test_wav.scp") - train_u2d.save(output_dir / 'train_utt2dur') - val_u2d.save(output_dir / 'val_utt2dur') - trainval_u2d.save(output_dir / 'trainval_utt2dur') - test_u2d.save(output_dir / 'test_utt2dur') + train_u2d.save(output_dir / "train_utt2dur") + val_u2d.save(output_dir / "val_utt2dur") + trainval_u2d.save(output_dir / "trainval_utt2dur") + test_u2d.save(output_dir / "test_utt2dur") - with open(output_dir / 'class2int', 'w') as f: + with open(output_dir / "class2int", "w") as f: for c in uclasses: - f.write('%s\n' % (c)) - + f.write("%s\n" % (c)) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='prepare lists to train nnet to discriminate between attacks types and benign speech') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--benign-durs', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="prepare lists to train nnet to discriminate between attacks types and benign speech", + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--benign-durs", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_type_v1.py b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_type_v1.py index 893b948b..9938c81d 100755 --- a/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_type_v1.py +++ b/egs/voxceleb/adv.v2/local/make_train_test_lists_exp_attack_type_v1.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -18,46 +23,56 @@ from hyperion.utils import Utt2Info, SCPList -def make_lists(input_dir, output_dir, - train_min_snr, train_max_snr, train_success_category, - test_min_snr, test_max_snr, test_success_category, - ): +def make_lists( + input_dir, + output_dir, + train_min_snr, + train_max_snr, + train_success_category, + test_min_snr, + test_max_snr, + test_success_category, +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'train_attack_info.yaml', 'r') as f: + with open(input_dir / "train_attack_info.yaml", "r") as f: train_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'val_attack_info.yaml', 'r') as f: + with open(input_dir / "val_attack_info.yaml", "r") as f: val_attacks = yaml.load(f, Loader=yaml.FullLoader) - with open(input_dir / 'test_attack_info.yaml', 'r') as f: + 
with open(input_dir / "test_attack_info.yaml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) keys = [] files = [] classes = [] durs = [] - for k,v in train_attacks.items(): - s = v['success'] - if not (train_success_category == 'both' or - train_success_category == 'success' and s or - train_success_category == 'fail' and not s): + for k, v in train_attacks.items(): + s = v["success"] + if not ( + train_success_category == "both" + or train_success_category == "success" + and s + or train_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < train_min_snr or snr > train_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - durs.append(v['num_samples']/16000) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + durs.append(v["num_samples"] / 16000) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + durs.append(v["num_samples"] / 16000) train_u2c = Utt2Info.create(keys, classes) train_u2d = Utt2Info.create(keys, durs) @@ -70,24 +85,28 @@ def make_lists(input_dir, output_dir, files = [] classes = [] durs = [] - for k,v in val_attacks.items(): - s = v['success'] - if not (train_success_category == 'both' or - train_success_category == 'success' and s or - train_success_category == 'fail' and not s): + for k, v in val_attacks.items(): + s = v["success"] + if not ( + train_success_category == "both" + or train_success_category == "success" + and s + or train_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < train_min_snr or snr > train_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - durs.append(v['num_samples']/16000) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + durs.append(v["num_samples"] / 16000) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + durs.append(v["num_samples"] / 16000) val_u2c = Utt2Info.create(keys, classes) val_u2d = Utt2Info.create(keys, durs) @@ -99,24 +118,28 @@ def make_lists(input_dir, output_dir, files = [] classes = [] durs = [] - for k,v in test_attacks.items(): - s = v['success'] - if not (test_success_category == 'both' or - test_success_category == 'success' and s or - test_success_category == 'fail' and not s): + for k, v in test_attacks.items(): + s = v["success"] + if not ( + test_success_category == "both" + or test_success_category == "success" + and s + or test_success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < test_min_snr or snr > test_max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - durs.append(v['num_samples']/16000) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - durs.append(v['num_samples']/16000) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + durs.append(v["num_samples"] / 16000) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + durs.append(v["num_samples"] / 16000) test_u2c = Utt2Info.create(keys, classes) test_u2d = Utt2Info.create(keys, durs) @@ -127,45 +150,53 @@ def 
make_lists(input_dir, output_dir, trainval_u2d = Utt2Info.merge([train_u2d, val_u2d]) ##### - train_u2c.save(output_dir / 'train_utt2attack') - val_u2c.save(output_dir / 'val_utt2attack') - test_u2c.save(output_dir / 'test_utt2attack') + train_u2c.save(output_dir / "train_utt2attack") + val_u2c.save(output_dir / "val_utt2attack") + test_u2c.save(output_dir / "test_utt2attack") - train_wav.save(output_dir / 'train_wav.scp') - val_wav.save(output_dir / 'val_wav.scp') - trainval_wav.save(output_dir / 'trainval_wav.scp') - test_wav.save(output_dir / 'test_wav.scp') + train_wav.save(output_dir / "train_wav.scp") + val_wav.save(output_dir / "val_wav.scp") + trainval_wav.save(output_dir / "trainval_wav.scp") + test_wav.save(output_dir / "test_wav.scp") - train_u2d.save(output_dir / 'train_utt2dur') - val_u2d.save(output_dir / 'val_utt2dur') - trainval_u2d.save(output_dir / 'trainval_utt2dur') - test_u2d.save(output_dir / 'test_utt2dur') + train_u2d.save(output_dir / "train_utt2dur") + val_u2d.save(output_dir / "val_utt2dur") + trainval_u2d.save(output_dir / "trainval_utt2dur") + test_u2d.save(output_dir / "test_utt2dur") - with open(output_dir / 'class_file', 'w') as f: + with open(output_dir / "class_file", "w") as f: for c in uclasses: - f.write('%s\n' % (c)) - + f.write("%s\n" % (c)) + if __name__ == "__main__": parser = ArgumentParser( - description='prepare lists to train nnet to discriminate between attacks types and benign speech') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('--train-min-snr', default=-10, type=float) - parser.add_argument('--train-max-snr', default=100, type=float) - parser.add_argument('--train-success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('--test-min-snr', default=-10, type=float) - parser.add_argument('--test-max-snr', default=100, type=float) - parser.add_argument('--test-success-category', default='success', - choices=['success', 'fail', 'both']) - - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + description="prepare lists to train nnet to discriminate between attacks types and benign speech" + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument("--train-min-snr", default=-10, type=float) + parser.add_argument("--train-max-snr", default=100, type=float) + parser.add_argument( + "--train-success-category", + default="success", + choices=["success", "fail", "both"], + ) + parser.add_argument("--test-min-snr", default=-10, type=float) + parser.add_argument("--test-max-snr", default=100, type=float) + parser.add_argument( + "--test-success-category", + default="success", + choices=["success", "fail", "both"], + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_snr_verif_v0.py b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_snr_verif_v0.py index d5f6ba01..01a4d144 100755 --- a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_snr_verif_v0.py +++ b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_snr_verif_v0.py @@ -17,22 +17,25 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import Utt2Info, SCPList, TrialKey -snr_levels = 
np.arange(0,65,10) +snr_levels = np.arange(0, 65, 10) + def quant_snr(snr): - q = np.argmin((snr_levels - snr)**2) - q_str = 'snr-%d' % (int(snr_levels[q])) + q = np.argmin((snr_levels - snr) ** 2) + q_str = "snr-%d" % (int(snr_levels[q])) return q_str -def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_sides, output_dir): +def make_lists( + input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_sides, output_dir +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'test_attack_info.yml', 'r') as f: + with open(input_dir / "test_attack_info.yml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) @@ -41,18 +44,18 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ classes = [] benign_keys = [] snrs = [] - for k,v in test_attacks.items(): + for k, v in test_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - snrs.append(quant_snr(v['snr'])) - benign_keys.append(v['benign_key']) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + snrs.append(quant_snr(v["snr"])) + benign_keys.append(v["benign_key"]) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') - snrs.append('benign') + classes.append("benign") + snrs.append("benign") files.append(k2w[k][0]) u2c = Utt2Info.create(keys, classes) @@ -60,34 +63,32 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ wav = SCPList(keys, files) ##### - u2c.save(output_dir / 'utt2attack') - wav.save(output_dir / 'wav.scp') + u2c.save(output_dir / "utt2attack") + wav.save(output_dir / "wav.scp") - mask = rng.rand(len(u2c)) > 1/(1 + num_enroll_sides) + mask = rng.rand(len(u2c)) > 1 / (1 + num_enroll_sides) enr_key = u2c.key[mask] - test_key = u2c.key[mask==False] + test_key = u2c.key[mask == False] enr_u2c = u2c.filter(enr_key) test_u2c = u2c.filter(test_key) enr_u2snr = u2snr.filter(enr_key) test_u2snr = u2snr.filter(test_key) if num_enroll_sides > 1: - class_uniq, class_ids = np.unique( - test_u2c.info, return_inverse=True) - snr_uniq, snr_ids = np.unique( - test_u2snr.info, return_inverse=True) + class_uniq, class_ids = np.unique(test_u2c.info, return_inverse=True) + snr_uniq, snr_ids = np.unique(test_u2snr.info, return_inverse=True) num_classes = len(class_uniq) * len(snr_uniq) count_sides = np.zeros((num_classes,), dtype=np.int) count_models = np.zeros((num_classes,), dtype=np.int) enroll_models = [] for i in range(len(test_u2c)): - a = class_ids[i] + a = class_ids[i] b = snr_ids[i] j = a * len(snr_uniq) + b side = count_sides[j] % num_enroll_sides if side == 0: count_models[j] += 1 - enroll_model = '%s-%s-%03d' % (class_uniq[a], snr_uniq[b], count_models[j]) + enroll_model = "%s-%s-%03d" % (class_uniq[a], snr_uniq[b], count_models[j]) enroll_models.append(enroll_model) count_sides[j] += 1 @@ -106,25 +107,31 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ for i in range(len(enr_e2snr)): for j in range(len(test_u2c)): if enr_e2snr.info[i] == test_u2snr.info[j]: - trials_all.tar[i,j] = True + trials_all.tar[i, j] = True if enr_e2c.info[i] in seen_attacks and test_u2c.info[j] in seen_attacks: - trials_seen.tar[i,j] = True - elif enr_e2c.info[i] not in seen_attacks and test_u2c.info[j] not in seen_attacks: - trials_unseen.tar[i,j] = True + 
trials_seen.tar[i, j] = True + elif ( + enr_e2c.info[i] not in seen_attacks + and test_u2c.info[j] not in seen_attacks + ): + trials_unseen.tar[i, j] = True else: - trials_all.non[i,j] = True + trials_all.non[i, j] = True if enr_e2c.info[i] in seen_attacks and test_u2c.info[j] in seen_attacks: - trials_seen.non[i,j] = True - elif enr_e2c.info[i] not in seen_attacks and test_u2c.info[j] not in seen_attacks: - trials_unseen.non[i,j] = True + trials_seen.non[i, j] = True + elif ( + enr_e2c.info[i] not in seen_attacks + and test_u2c.info[j] not in seen_attacks + ): + trials_unseen.non[i, j] = True max_trials = int(max_trials * 1e6) - num_tar_trials = np.sum(trials_all.tar) + num_tar_trials = np.sum(trials_all.tar) num_non_trials = np.sum(trials_all.non) num_trials = num_tar_trials + num_non_trials if num_trials > max_trials: p = max_trials / num_trials - logging.info('reducing number of trials (%d) with p=%f' % (num_trials, p)) + logging.info("reducing number of trials (%d) with p=%f" % (num_trials, p)) mask = rng.rand(*trials_all.tar.shape) > p trials_all.non[mask] = False trials_seen.non[mask] = False @@ -134,10 +141,10 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ trials_unseen.tar[mask] = False enr_u2e.sort(1) - enr_u2e.save(output_dir / 'utt2enr') - trials_all.save_txt(output_dir / 'trials') - trials_seen.save_txt(output_dir / 'trials_seen') - trials_unseen.save_txt(output_dir / 'trials_unseen') + enr_u2e.save(output_dir / "utt2enr") + trials_all.save_txt(output_dir / "trials") + trials_seen.save_txt(output_dir / "trials_seen") + trials_unseen.save_txt(output_dir / "trials_unseen") # train_u2d.save(output_dir / 'train_utt2dur') # val_u2d.save(output_dir / 'val_utt2dur') @@ -147,25 +154,27 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ # with open(output_dir / 'class2int', 'w') as f: # for c in uclasses: # f.write('%s\n' % (c)) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='prepare trial list to do attack type verification') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--seen-attacks', required=True, nargs='+') - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--num-enroll-sides', default=1, type=int) - parser.add_argument('--max-trials', default=10, type=float) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="prepare trial list to do attack type verification", + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--seen-attacks", required=True, nargs="+") + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--num-enroll-sides", default=1, type=int) + parser.add_argument("--max-trials", default=10, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_snr_verif_v2.py b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_snr_verif_v2.py index 7204a828..2cc45d18 100755 --- 
a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_snr_verif_v2.py +++ b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_snr_verif_v2.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -17,83 +22,92 @@ from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import Utt2Info, SCPList, TrialKey -snr_levels = np.arange(0,65,10) +snr_levels = np.arange(0, 65, 10) + def quant_snr(snr): - q = np.argmin((snr_levels - snr)**2) - q_str = 'snr-%d' % (int(snr_levels[q])) + q = np.argmin((snr_levels - snr) ** 2) + q_str = "snr-%d" % (int(snr_levels[q])) return q_str -def make_lists(input_dir, known_attacks, output_dir, - min_snr, max_snr, success_category, - max_trials, num_enroll_sides): +def make_lists( + input_dir, + known_attacks, + output_dir, + min_snr, + max_snr, + success_category, + max_trials, + num_enroll_sides, +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'test_attack_info.yaml', 'r') as f: + with open(input_dir / "test_attack_info.yaml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) keys = [] files = [] classes = [] snrs = [] - for k,v in test_attacks.items(): - s = v['success'] - if not (success_category == 'both' or - success_category == 'success' and s or - success_category == 'fail' and not s): + for k, v in test_attacks.items(): + s = v["success"] + if not ( + success_category == "both" + or success_category == "success" + and s + or success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < min_snr or snr > max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - snrs.append(quant_snr(v['snr'])) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - snrs.append('benign') - + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + snrs.append(quant_snr(v["snr"])) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + snrs.append("benign") u2c = Utt2Info.create(keys, classes) u2snr = Utt2Info.create(keys, snrs) wav = SCPList(keys, files) ##### - u2c.save(output_dir / 'utt2attack') - wav.save(output_dir / 'wav.scp') + u2c.save(output_dir / "utt2attack") + wav.save(output_dir / "wav.scp") - mask = rng.rand(len(u2c)) > 1/(1 + num_enroll_sides) + mask = rng.rand(len(u2c)) > 1 / (1 + num_enroll_sides) enr_key = u2c.key[mask] - test_key = u2c.key[mask==False] + test_key = u2c.key[mask == False] enr_u2c = u2c.filter(enr_key) test_u2c = u2c.filter(test_key) enr_u2snr = u2snr.filter(enr_key) test_u2snr = u2snr.filter(test_key) if num_enroll_sides > 1: - class_uniq, class_ids = np.unique( - test_u2c.info, return_inverse=True) - snr_uniq, snr_ids = np.unique( - test_u2snr.info, return_inverse=True) + class_uniq, class_ids = np.unique(test_u2c.info, return_inverse=True) + snr_uniq, snr_ids = np.unique(test_u2snr.info, return_inverse=True) num_classes = len(class_uniq) * len(snr_uniq) count_sides = np.zeros((num_classes,), dtype=np.int) count_models = np.zeros((num_classes,), dtype=np.int) enroll_models = [] for i in range(len(test_u2c)): - a = class_ids[i] + a = class_ids[i] b = snr_ids[i] j = a * len(snr_uniq) + b side = count_sides[j] % num_enroll_sides if 
side == 0: count_models[j] += 1 - enroll_model = '%s-%s-%03d' % (class_uniq[a], snr_uniq[b], count_models[j]) + enroll_model = "%s-%s-%03d" % (class_uniq[a], snr_uniq[b], count_models[j]) enroll_models.append(enroll_model) count_sides[j] += 1 @@ -112,25 +126,37 @@ def make_lists(input_dir, known_attacks, output_dir, for i in range(len(enr_e2snr)): for j in range(len(test_u2c)): if enr_e2snr.info[i] == test_u2snr.info[j]: - trials_all.tar[i,j] = True - if enr_e2c.info[i] in known_attacks and test_u2c.info[j] in known_attacks: - trials_known.tar[i,j] = True - elif enr_e2c.info[i] not in known_attacks and test_u2c.info[j] not in known_attacks: - trials_unknown.tar[i,j] = True + trials_all.tar[i, j] = True + if ( + enr_e2c.info[i] in known_attacks + and test_u2c.info[j] in known_attacks + ): + trials_known.tar[i, j] = True + elif ( + enr_e2c.info[i] not in known_attacks + and test_u2c.info[j] not in known_attacks + ): + trials_unknown.tar[i, j] = True else: - trials_all.non[i,j] = True - if enr_e2c.info[i] in known_attacks and test_u2c.info[j] in known_attacks: - trials_known.non[i,j] = True - elif enr_e2c.info[i] not in known_attacks and test_u2c.info[j] not in known_attacks: - trials_unknown.non[i,j] = True + trials_all.non[i, j] = True + if ( + enr_e2c.info[i] in known_attacks + and test_u2c.info[j] in known_attacks + ): + trials_known.non[i, j] = True + elif ( + enr_e2c.info[i] not in known_attacks + and test_u2c.info[j] not in known_attacks + ): + trials_unknown.non[i, j] = True max_trials = int(max_trials * 1e6) - num_tar_trials = np.sum(trials_all.tar) + num_tar_trials = np.sum(trials_all.tar) num_non_trials = np.sum(trials_all.non) num_trials = num_tar_trials + num_non_trials if num_trials > max_trials: p = max_trials / num_trials - logging.info('reducing number of trials (%d) with p=%f' % (num_trials, p)) + logging.info("reducing number of trials (%d) with p=%f" % (num_trials, p)) mask = rng.rand(*trials_all.tar.shape) > p trials_all.non[mask] = False trials_known.non[mask] = False @@ -140,10 +166,10 @@ def make_lists(input_dir, known_attacks, output_dir, trials_unknown.tar[mask] = False enr_u2e.sort(1) - enr_u2e.save(output_dir / 'utt2enr') - trials_all.save_txt(output_dir / 'trials') - trials_known.save_txt(output_dir / 'trials_known') - trials_unknown.save_txt(output_dir / 'trials_unknown') + enr_u2e.save(output_dir / "utt2enr") + trials_all.save_txt(output_dir / "trials") + trials_known.save_txt(output_dir / "trials_known") + trials_unknown.save_txt(output_dir / "trials_unknown") # train_u2d.save(output_dir / 'train_utt2dur') # val_u2d.save(output_dir / 'val_utt2dur') @@ -153,26 +179,29 @@ def make_lists(input_dir, known_attacks, output_dir, # with open(output_dir / 'class2int', 'w') as f: # for c in uclasses: # f.write('%s\n' % (c)) - + if __name__ == "__main__": parser = ArgumentParser( - description='prepare trial list to do attack type verification') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--known-attacks', required=True, nargs='+') - parser.add_argument('--num-enroll-sides', default=1, type=int) - parser.add_argument('--max-trials', default=10, type=float) - parser.add_argument('--success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('--min-snr', default=-10, type=float) - parser.add_argument('--max-snr', default=100, type=float) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - 
args=parser.parse_args() + description="prepare trial list to do attack type verification" + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--known-attacks", required=True, nargs="+") + parser.add_argument("--num-enroll-sides", default=1, type=int) + parser.add_argument("--max-trials", default=10, type=float) + parser.add_argument( + "--success-category", default="success", choices=["success", "fail", "both"] + ) + parser.add_argument("--min-snr", default=-10, type=float) + parser.add_argument("--max-snr", default=100, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_threat_model_verif_v0.py b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_threat_model_verif_v0.py index 3c67e3bd..48b75682 100755 --- a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_threat_model_verif_v0.py +++ b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_threat_model_verif_v0.py @@ -18,15 +18,16 @@ from hyperion.utils import Utt2Info, SCPList, TrialKey - -def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_sides, output_dir): +def make_lists( + input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_sides, output_dir +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'test_attack_info.yml', 'r') as f: + with open(input_dir / "test_attack_info.yml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) @@ -35,18 +36,18 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ classes = [] benign_keys = [] tms = [] - for k,v in test_attacks.items(): + for k, v in test_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - tms.append(v['threat_model']) - benign_keys.append(v['benign_key']) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + tms.append(v["threat_model"]) + benign_keys.append(v["benign_key"]) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') - tms.append('benign') + classes.append("benign") + tms.append("benign") files.append(k2w[k][0]) u2c = Utt2Info.create(keys, classes) @@ -54,34 +55,32 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ wav = SCPList(keys, files) ##### - u2c.save(output_dir / 'utt2attack') - wav.save(output_dir / 'wav.scp') + u2c.save(output_dir / "utt2attack") + wav.save(output_dir / "wav.scp") - mask = rng.rand(len(u2c)) > 1/(num_enroll_sides + 1) + mask = rng.rand(len(u2c)) > 1 / (num_enroll_sides + 1) enr_key = u2c.key[mask] - test_key = u2c.key[mask==False] + test_key = u2c.key[mask == False] enr_u2c = u2c.filter(enr_key) test_u2c = u2c.filter(test_key) enr_u2tm = u2tm.filter(enr_key) test_u2tm = u2tm.filter(test_key) if num_enroll_sides > 1: - class_uniq, class_ids = np.unique( - test_u2c.info, return_inverse=True) - tm_uniq, tm_ids = np.unique( - test_u2tm.info, return_inverse=True) + class_uniq, class_ids = np.unique(test_u2c.info, return_inverse=True) + tm_uniq, tm_ids = np.unique(test_u2tm.info, return_inverse=True) num_classes = len(class_uniq) * len(tm_uniq) count_sides = np.zeros((num_classes,), 
dtype=np.int) count_models = np.zeros((num_classes,), dtype=np.int) enroll_models = [] for i in range(len(test_u2c)): - a = class_ids[i] + a = class_ids[i] b = tm_ids[i] j = a * len(tm_uniq) + b side = count_sides[j] % num_enroll_sides if side == 0: count_models[j] += 1 - enroll_model = '%s-%s-%03d' % (class_uniq[a], tm_uniq[b], count_models[j]) + enroll_model = "%s-%s-%03d" % (class_uniq[a], tm_uniq[b], count_models[j]) enroll_models.append(enroll_model) count_sides[j] += 1 @@ -100,25 +99,31 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ for i in range(len(enr_e2tm)): for j in range(len(test_u2c)): if enr_e2tm.info[i] == test_u2tm.info[j]: - trials_all.tar[i,j] = True + trials_all.tar[i, j] = True if enr_e2c.info[i] in seen_attacks and test_u2c.info[j] in seen_attacks: - trials_seen.tar[i,j] = True - elif enr_e2c.info[i] not in seen_attacks and test_u2c.info[j] not in seen_attacks: - trials_unseen.tar[i,j] = True + trials_seen.tar[i, j] = True + elif ( + enr_e2c.info[i] not in seen_attacks + and test_u2c.info[j] not in seen_attacks + ): + trials_unseen.tar[i, j] = True else: - trials_all.non[i,j] = True + trials_all.non[i, j] = True if enr_e2c.info[i] in seen_attacks and test_u2c.info[j] in seen_attacks: - trials_seen.non[i,j] = True - elif enr_e2c.info[i] not in seen_attacks and test_u2c.info[j] not in seen_attacks: - trials_unseen.non[i,j] = True + trials_seen.non[i, j] = True + elif ( + enr_e2c.info[i] not in seen_attacks + and test_u2c.info[j] not in seen_attacks + ): + trials_unseen.non[i, j] = True max_trials = int(max_trials * 1e6) - num_tar_trials = np.sum(trials_all.tar) + num_tar_trials = np.sum(trials_all.tar) num_non_trials = np.sum(trials_all.non) num_trials = num_tar_trials + num_non_trials if num_trials > max_trials: p = max_trials / num_trials - logging.info('reducing number of trials (%d) with p=%f' % (num_trials, p)) + logging.info("reducing number of trials (%d) with p=%f" % (num_trials, p)) mask = rng.rand(*trials_all.tar.shape) > p trials_all.non[mask] = False trials_seen.non[mask] = False @@ -128,10 +133,10 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ trials_unseen.tar[mask] = False enr_u2e.sort(1) - enr_u2e.save(output_dir / 'utt2enr') - trials_all.save_txt(output_dir / 'trials') - trials_seen.save_txt(output_dir / 'trials_seen') - trials_unseen.save_txt(output_dir / 'trials_unseen') + enr_u2e.save(output_dir / "utt2enr") + trials_all.save_txt(output_dir / "trials") + trials_seen.save_txt(output_dir / "trials_seen") + trials_unseen.save_txt(output_dir / "trials_unseen") # train_u2d.save(output_dir / 'train_utt2dur') # val_u2d.save(output_dir / 'val_utt2dur') @@ -141,29 +146,29 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, max_trials, num_enroll_ # with open(output_dir / 'class2int', 'w') as f: # for c in uclasses: # f.write('%s\n' % (c)) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='prepare trial list to do attack threat model verification') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--seen-attacks', required=True, nargs='+') - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--num-enroll-sides', default=1, type=int) - parser.add_argument('--max-trials', default=10, type=float) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', 
'--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="prepare trial list to do attack threat model verification", + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--seen-attacks", required=True, nargs="+") + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--num-enroll-sides", default=1, type=int) + parser.add_argument("--max-trials", default=10, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) make_lists(**vars(args)) - - diff --git a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_threat_model_verif_v2.py b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_threat_model_verif_v2.py index 42458569..92d18095 100755 --- a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_threat_model_verif_v2.py +++ b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_threat_model_verif_v2.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -18,75 +23,83 @@ from hyperion.utils import Utt2Info, SCPList, TrialKey - -def make_lists(input_dir, known_attacks, output_dir, - min_snr, max_snr, success_category, - max_trials, num_enroll_sides): +def make_lists( + input_dir, + known_attacks, + output_dir, + min_snr, + max_snr, + success_category, + max_trials, + num_enroll_sides, +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'test_attack_info.yaml', 'r') as f: + with open(input_dir / "test_attack_info.yaml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) keys = [] files = [] classes = [] tms = [] - for k,v in test_attacks.items(): - s = v['success'] - if not (success_category == 'both' or - success_category == 'success' and s or - success_category == 'fail' and not s): + for k, v in test_attacks.items(): + s = v["success"] + if not ( + success_category == "both" + or success_category == "success" + and s + or success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < min_snr or snr > max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - tms.append(v['threat_model']) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - tms.append('benign') + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + tms.append(v["threat_model"]) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") + tms.append("benign") u2c = Utt2Info.create(keys, classes) u2tm = Utt2Info.create(keys, tms) wav = SCPList(keys, files) ##### - u2c.save(output_dir / 'utt2attack') - wav.save(output_dir / 'wav.scp') + u2c.save(output_dir / "utt2attack") + wav.save(output_dir / "wav.scp") - mask = rng.rand(len(u2c)) > 1/(num_enroll_sides + 1) + mask = rng.rand(len(u2c)) > 1 / (num_enroll_sides + 1) enr_key = u2c.key[mask] - test_key = u2c.key[mask==False] + test_key = u2c.key[mask == False] enr_u2c = u2c.filter(enr_key) test_u2c = u2c.filter(test_key) enr_u2tm = 
u2tm.filter(enr_key) test_u2tm = u2tm.filter(test_key) if num_enroll_sides > 1: - class_uniq, class_ids = np.unique( - test_u2c.info, return_inverse=True) - tm_uniq, tm_ids = np.unique( - test_u2tm.info, return_inverse=True) + class_uniq, class_ids = np.unique(test_u2c.info, return_inverse=True) + tm_uniq, tm_ids = np.unique(test_u2tm.info, return_inverse=True) num_classes = len(class_uniq) * len(tm_uniq) count_sides = np.zeros((num_classes,), dtype=np.int) count_models = np.zeros((num_classes,), dtype=np.int) enroll_models = [] for i in range(len(test_u2c)): - a = class_ids[i] + a = class_ids[i] b = tm_ids[i] j = a * len(tm_uniq) + b side = count_sides[j] % num_enroll_sides if side == 0: count_models[j] += 1 - enroll_model = '%s-%s-%03d' % (class_uniq[a], tm_uniq[b], count_models[j]) + enroll_model = "%s-%s-%03d" % (class_uniq[a], tm_uniq[b], count_models[j]) enroll_models.append(enroll_model) count_sides[j] += 1 @@ -105,25 +118,37 @@ def make_lists(input_dir, known_attacks, output_dir, for i in range(len(enr_e2tm)): for j in range(len(test_u2c)): if enr_e2tm.info[i] == test_u2tm.info[j]: - trials_all.tar[i,j] = True - if enr_e2c.info[i] in known_attacks and test_u2c.info[j] in known_attacks: - trials_known.tar[i,j] = True - elif enr_e2c.info[i] not in known_attacks and test_u2c.info[j] not in known_attacks: - trials_unknown.tar[i,j] = True + trials_all.tar[i, j] = True + if ( + enr_e2c.info[i] in known_attacks + and test_u2c.info[j] in known_attacks + ): + trials_known.tar[i, j] = True + elif ( + enr_e2c.info[i] not in known_attacks + and test_u2c.info[j] not in known_attacks + ): + trials_unknown.tar[i, j] = True else: - trials_all.non[i,j] = True - if enr_e2c.info[i] in known_attacks and test_u2c.info[j] in known_attacks: - trials_known.non[i,j] = True - elif enr_e2c.info[i] not in known_attacks and test_u2c.info[j] not in known_attacks: - trials_unknown.non[i,j] = True + trials_all.non[i, j] = True + if ( + enr_e2c.info[i] in known_attacks + and test_u2c.info[j] in known_attacks + ): + trials_known.non[i, j] = True + elif ( + enr_e2c.info[i] not in known_attacks + and test_u2c.info[j] not in known_attacks + ): + trials_unknown.non[i, j] = True max_trials = int(max_trials * 1e6) - num_tar_trials = np.sum(trials_all.tar) + num_tar_trials = np.sum(trials_all.tar) num_non_trials = np.sum(trials_all.non) num_trials = num_tar_trials + num_non_trials if num_trials > max_trials: p = max_trials / num_trials - logging.info('reducing number of trials (%d) with p=%f' % (num_trials, p)) + logging.info("reducing number of trials (%d) with p=%f" % (num_trials, p)) mask = rng.rand(*trials_all.tar.shape) > p trials_all.non[mask] = False trials_known.non[mask] = False @@ -133,34 +158,35 @@ def make_lists(input_dir, known_attacks, output_dir, trials_unknown.tar[mask] = False enr_u2e.sort(1) - enr_u2e.save(output_dir / 'utt2enr') - trials_all.save_txt(output_dir / 'trials') - trials_known.save_txt(output_dir / 'trials_known') - trials_unknown.save_txt(output_dir / 'trials_unknown') + enr_u2e.save(output_dir / "utt2enr") + trials_all.save_txt(output_dir / "trials") + trials_known.save_txt(output_dir / "trials_known") + trials_unknown.save_txt(output_dir / "trials_unknown") if __name__ == "__main__": parser = ArgumentParser( - description='prepare trial list to do attack threat model verification') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--known-attacks', required=True, nargs='+') - parser.add_argument('--num-enroll-sides', default=1, type=int) - 
parser.add_argument('--max-trials', default=10, type=float) - parser.add_argument('--success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('--min-snr', default=-10, type=float) - parser.add_argument('--max-snr', default=100, type=float) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + description="prepare trial list to do attack threat model verification" + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--known-attacks", required=True, nargs="+") + parser.add_argument("--num-enroll-sides", default=1, type=int) + parser.add_argument("--max-trials", default=10, type=float) + parser.add_argument( + "--success-category", default="success", choices=["success", "fail", "both"] + ) + parser.add_argument("--min-snr", default=-10, type=float) + parser.add_argument("--max-snr", default=100, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) make_lists(**vars(args)) - - diff --git a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_novelty_v0.py b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_novelty_v0.py index 41ab8fac..8bcd1472 100755 --- a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_novelty_v0.py +++ b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_novelty_v0.py @@ -25,7 +25,7 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, output_dir): output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'test_attack_info.yml', 'r') as f: + with open(input_dir / "test_attack_info.yml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) @@ -35,62 +35,64 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, output_dir): classes = [] benign_keys = [] durs = [] - for k,v in test_attacks.items(): + for k, v in test_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - benign_keys.append(v['benign_key']) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + benign_keys.append(v["benign_key"]) benign_keys = np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) u2c = Utt2Info.create(keys, classes) - #test_u2d = Utt2Info.create(keys, durs) + # test_u2d = Utt2Info.create(keys, durs) wav = SCPList(keys, files) ##### - u2c.save(output_dir / 'utt2attack') - wav.save(output_dir / 'wav.scp') + u2c.save(output_dir / "utt2attack") + wav.save(output_dir / "wav.scp") - with open(output_dir / 'trials', 'w') as f: + with open(output_dir / "trials", "w") as f: for i in range(len(u2c)): k = u2c.key[i] att = u2c.info[i] if att in seen_attacks: - f.write('seen %s nontarget\n' % k) + f.write("seen %s nontarget\n" % k) else: - f.write('seen %s target\n' % k) + f.write("seen %s target\n" % k) - with open(output_dir / 'trials_nobenign', 'w') as f: + with open(output_dir / "trials_nobenign", "w") as f: for i in range(len(u2c)): k = u2c.key[i] att = u2c.info[i] - if att in ['benign']: + if att in ["benign"]: continue if att in seen_attacks: - f.write('seen %s nontarget\n' % k) + f.write("seen %s nontarget\n" % k) else: - f.write('seen %s 
target\n' % k) - + f.write("seen %s target\n" % k) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='prepare trial list to do attack type novelty det') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--seen-attacks', required=True, nargs='+') - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="prepare trial list to do attack type novelty det", + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--seen-attacks", required=True, nargs="+") + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_novelty_v2.py b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_novelty_v2.py index 63984704..32fdb1d0 100755 --- a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_novelty_v2.py +++ b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_novelty_v2.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -18,84 +23,90 @@ from hyperion.utils import Utt2Info, SCPList, TrialKey -def make_lists(input_dir, known_attacks, - min_snr, max_snr, success_category, - output_dir): +def make_lists( + input_dir, known_attacks, min_snr, max_snr, success_category, output_dir +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'test_attack_info.yaml', 'r') as f: + with open(input_dir / "test_attack_info.yaml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) keys = [] files = [] classes = [] - for k,v in test_attacks.items(): - s = v['success'] - if not (success_category == 'both' or - success_category == 'success' and s or - success_category == 'fail' and not s): + for k, v in test_attacks.items(): + s = v["success"] + if not ( + success_category == "both" + or success_category == "success" + and s + or success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < min_snr or snr > max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') - + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") u2c = Utt2Info.create(keys, classes) wav = SCPList(keys, files) ##### - u2c.save(output_dir / 'utt2attack') - wav.save(output_dir / 'wav.scp') + u2c.save(output_dir / "utt2attack") + wav.save(output_dir / "wav.scp") - with open(output_dir / 'trials', 'w') as f: + with open(output_dir / "trials", "w") as f: for i in range(len(u2c)): k = u2c.key[i] att = u2c.info[i] if 
att in known_attacks: - f.write('known %s nontarget\n' % k) + f.write("known %s nontarget\n" % k) else: - f.write('known %s target\n' % k) + f.write("known %s target\n" % k) - with open(output_dir / 'trials_nobenign', 'w') as f: + with open(output_dir / "trials_nobenign", "w") as f: for i in range(len(u2c)): k = u2c.key[i] att = u2c.info[i] - if att in ['benign']: + if att in ["benign"]: continue if att in known_attacks: - f.write('known %s nontarget\n' % k) + f.write("known %s nontarget\n" % k) else: - f.write('known %s target\n' % k) - + f.write("known %s target\n" % k) + if __name__ == "__main__": parser = ArgumentParser( - description='prepare trial list to do attack type novelty det') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--known-attacks', required=True, nargs='+') - #parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('--min-snr', default=-10, type=float) - parser.add_argument('--max-snr', default=100, type=float) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + description="prepare trial list to do attack type novelty det" + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--known-attacks", required=True, nargs="+") + # parser.add_argument('--benign-wav-file', required=True) + parser.add_argument( + "--success-category", default="success", choices=["success", "fail", "both"] + ) + parser.add_argument("--min-snr", default=-10, type=float) + parser.add_argument("--max-snr", default=100, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_verif_v0.py b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_verif_v0.py index a57d1dd6..4ce2b925 100755 --- a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_verif_v0.py +++ b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_verif_v0.py @@ -18,19 +18,21 @@ from hyperion.utils import Utt2Info, SCPList, TrialKey -def make_lists(input_dir, seen_attacks, benign_wav_file, output_dir, max_trials, num_enroll_sides): +def make_lists( + input_dir, seen_attacks, benign_wav_file, output_dir, max_trials, num_enroll_sides +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'test_attack_info.yml', 'r') as f: + with open(input_dir / "test_attack_info.yml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) k2w = SCPList.load(benign_wav_file) - #u2d = Utt2Info.load(benign_durs) - + # u2d = Utt2Info.load(benign_durs) + # keys = [] # files = [] # classes = [] @@ -87,29 +89,29 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, output_dir, max_trials, classes = [] benign_keys = [] durs = [] - for k,v in test_attacks.items(): + for k, v in test_attacks.items(): keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - benign_keys.append(v['benign_key']) + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + benign_keys.append(v["benign_key"]) benign_keys = 
np.unique(benign_keys) for k in benign_keys: keys.append(k) - classes.append('benign') + classes.append("benign") files.append(k2w[k][0]) u2c = Utt2Info.create(keys, classes) - #test_u2d = Utt2Info.create(keys, durs) + # test_u2d = Utt2Info.create(keys, durs) wav = SCPList(keys, files) ##### - u2c.save(output_dir / 'utt2attack') - wav.save(output_dir / 'wav.scp') + u2c.save(output_dir / "utt2attack") + wav.save(output_dir / "wav.scp") - mask = rng.rand(len(u2c)) > 1/(num_enroll_sides + 1) + mask = rng.rand(len(u2c)) > 1 / (num_enroll_sides + 1) enr_key = u2c.key[mask] - test_key = u2c.key[mask==False] + test_key = u2c.key[mask == False] enr_u2c = u2c.filter(enr_key) test_u2c = u2c.filter(test_key) @@ -124,7 +126,7 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, output_dir, max_trials, side = count_sides[j] % num_enroll_sides if side == 0: count_models[j] += 1 - enroll_model = '%s-%03d' % (class_uniq[j], count_models[j]) + enroll_model = "%s-%03d" % (class_uniq[j], count_models[j]) enroll_models.append(enroll_model) count_sides[j] += 1 @@ -134,33 +136,35 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, output_dir, max_trials, else: enr_e2c = enr_u2c enr_u2e = Utt2Info.create(enr_u2c.key, enr_u2c.key) - + trials_all = TrialKey(enr_e2c.key, test_u2c.key) trials_seen = TrialKey(enr_e2c.key, test_u2c.key) trials_unseen = TrialKey(enr_e2c.key, test_u2c.key) for i in range(len(enr_e2c)): for j in range(len(test_u2c)): if enr_e2c.info[i] == test_u2c.info[j]: - trials_all.tar[i,j] = True + trials_all.tar[i, j] = True if enr_e2c.info[i] in seen_attacks: - trials_seen.tar[i,j] = True + trials_seen.tar[i, j] = True else: - trials_unseen.tar[i,j] = True + trials_unseen.tar[i, j] = True else: - trials_all.non[i,j] = True + trials_all.non[i, j] = True if enr_e2c.info[i] in seen_attacks and test_u2c.info[j] in seen_attacks: - trials_seen.non[i,j] = True - elif enr_e2c.info[i] not in seen_attacks and test_u2c.info[j] not in seen_attacks: - trials_unseen.non[i,j] = True - + trials_seen.non[i, j] = True + elif ( + enr_e2c.info[i] not in seen_attacks + and test_u2c.info[j] not in seen_attacks + ): + trials_unseen.non[i, j] = True max_trials = int(max_trials * 1e6) - num_tar_trials = np.sum(trials_all.tar) + num_tar_trials = np.sum(trials_all.tar) num_non_trials = np.sum(trials_all.non) num_trials = num_tar_trials + num_non_trials if num_trials > max_trials: p = max_trials / num_trials - logging.info('reducing number of trials (%d) with p=%f' % (num_trials, p)) + logging.info("reducing number of trials (%d) with p=%f" % (num_trials, p)) mask = rng.rand(*trials_all.tar.shape) > p trials_all.non[mask] = False trials_seen.non[mask] = False @@ -170,10 +174,10 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, output_dir, max_trials, trials_unseen.tar[mask] = False enr_u2e.sort(1) - enr_u2e.save(output_dir / 'utt2enr') - trials_all.save_txt(output_dir / 'trials') - trials_seen.save_txt(output_dir / 'trials_seen') - trials_unseen.save_txt(output_dir / 'trials_unseen') + enr_u2e.save(output_dir / "utt2enr") + trials_all.save_txt(output_dir / "trials") + trials_seen.save_txt(output_dir / "trials_seen") + trials_unseen.save_txt(output_dir / "trials_unseen") # train_u2d.save(output_dir / 'train_utt2dur') # val_u2d.save(output_dir / 'val_utt2dur') @@ -183,26 +187,28 @@ def make_lists(input_dir, seen_attacks, benign_wav_file, output_dir, max_trials, # with open(output_dir / 'class2int', 'w') as f: # for c in uclasses: # f.write('%s\n' % (c)) - + if __name__ == "__main__": - 
parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='prepare trial list to do attack type verification') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--seen-attacks', required=True, nargs='+') - parser.add_argument('--benign-wav-file', required=True) - parser.add_argument('--num-enroll-sides', default=1, type=int) - parser.add_argument('--max-trials', default=10, type=float) - #parser.add_argument('--benign-durs', required=True) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="prepare trial list to do attack type verification", + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--seen-attacks", required=True, nargs="+") + parser.add_argument("--benign-wav-file", required=True) + parser.add_argument("--num-enroll-sides", default=1, type=int) + parser.add_argument("--max-trials", default=10, type=float) + # parser.add_argument('--benign-durs', required=True) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_verif_v2.py b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_verif_v2.py index 2fd75fa1..8ae69790 100755 --- a/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_verif_v2.py +++ b/egs/voxceleb/adv.v2/local/make_trials_exp_attack_type_verif_v2.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -18,48 +23,59 @@ from hyperion.utils import Utt2Info, SCPList, TrialKey -def make_lists(input_dir, known_attacks, output_dir, - min_snr, max_snr, success_category, - max_trials, num_enroll_sides): +def make_lists( + input_dir, + known_attacks, + output_dir, + min_snr, + max_snr, + success_category, + max_trials, + num_enroll_sides, +): rng = np.random.RandomState(seed=1234) input_dir = Path(input_dir) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(input_dir / 'test_attack_info.yaml', 'r') as f: + with open(input_dir / "test_attack_info.yaml", "r") as f: test_attacks = yaml.load(f, Loader=yaml.FullLoader) keys = [] files = [] classes = [] - for k,v in test_attacks.items(): - s = v['success'] - if not (success_category == 'both' or - success_category == 'success' and s or - success_category == 'fail' and not s): + for k, v in test_attacks.items(): + s = v["success"] + if not ( + success_category == "both" + or success_category == "success" + and s + or success_category == "fail" + and not s + ): continue - snr = v['snr'] + snr = v["snr"] if snr < min_snr or snr > max_snr: continue keys.append(k) - files.append(v['wav_path']) - classes.append(v['attack_type']) - keys.append(v['key_benign']) - files.append(v['wav_benign']) - classes.append('benign') + files.append(v["wav_path"]) + classes.append(v["attack_type"]) + keys.append(v["key_benign"]) + files.append(v["wav_benign"]) + classes.append("benign") u2c = 
Utt2Info.create(keys, classes) wav = SCPList(keys, files) ##### - u2c.save(output_dir / 'utt2attack') - wav.save(output_dir / 'wav.scp') + u2c.save(output_dir / "utt2attack") + wav.save(output_dir / "wav.scp") - mask = rng.rand(len(u2c)) > 1/(num_enroll_sides + 1) + mask = rng.rand(len(u2c)) > 1 / (num_enroll_sides + 1) enr_key = u2c.key[mask] - test_key = u2c.key[mask==False] + test_key = u2c.key[mask == False] enr_u2c = u2c.filter(enr_key) test_u2c = u2c.filter(test_key) @@ -74,7 +90,7 @@ def make_lists(input_dir, known_attacks, output_dir, side = count_sides[j] % num_enroll_sides if side == 0: count_models[j] += 1 - enroll_model = '%s-%03d' % (class_uniq[j], count_models[j]) + enroll_model = "%s-%03d" % (class_uniq[j], count_models[j]) enroll_models.append(enroll_model) count_sides[j] += 1 @@ -84,33 +100,38 @@ def make_lists(input_dir, known_attacks, output_dir, else: enr_e2c = enr_u2c enr_u2e = Utt2Info.create(enr_u2c.key, enr_u2c.key) - + trials_all = TrialKey(enr_e2c.key, test_u2c.key) trials_known = TrialKey(enr_e2c.key, test_u2c.key) trials_unknown = TrialKey(enr_e2c.key, test_u2c.key) for i in range(len(enr_e2c)): for j in range(len(test_u2c)): if enr_e2c.info[i] == test_u2c.info[j]: - trials_all.tar[i,j] = True + trials_all.tar[i, j] = True if enr_e2c.info[i] in known_attacks: - trials_known.tar[i,j] = True + trials_known.tar[i, j] = True else: - trials_unknown.tar[i,j] = True + trials_unknown.tar[i, j] = True else: - trials_all.non[i,j] = True - if enr_e2c.info[i] in known_attacks and test_u2c.info[j] in known_attacks: - trials_known.non[i,j] = True - elif enr_e2c.info[i] not in known_attacks and test_u2c.info[j] not in known_attacks: - trials_unknown.non[i,j] = True - + trials_all.non[i, j] = True + if ( + enr_e2c.info[i] in known_attacks + and test_u2c.info[j] in known_attacks + ): + trials_known.non[i, j] = True + elif ( + enr_e2c.info[i] not in known_attacks + and test_u2c.info[j] not in known_attacks + ): + trials_unknown.non[i, j] = True max_trials = int(max_trials * 1e6) - num_tar_trials = np.sum(trials_all.tar) + num_tar_trials = np.sum(trials_all.tar) num_non_trials = np.sum(trials_all.non) num_trials = num_tar_trials + num_non_trials if num_trials > max_trials: p = max_trials / num_trials - logging.info('reducing number of trials (%d) with p=%f' % (num_trials, p)) + logging.info("reducing number of trials (%d) with p=%f" % (num_trials, p)) mask = rng.rand(*trials_all.tar.shape) > p trials_all.non[mask] = False trials_known.non[mask] = False @@ -120,34 +141,41 @@ def make_lists(input_dir, known_attacks, output_dir, trials_unknown.tar[mask] = False enr_u2e.sort(1) - enr_u2e.save(output_dir / 'utt2enr') - trials_all.save_txt(output_dir / 'trials') - trials_known.save_txt(output_dir / 'trials_known') - trials_unknown.save_txt(output_dir / 'trials_unknown') - + enr_u2e.save(output_dir / "utt2enr") + trials_all.save_txt(output_dir / "trials") + trials_known.save_txt(output_dir / "trials_known") + trials_unknown.save_txt(output_dir / "trials_unknown") + if __name__ == "__main__": parser = ArgumentParser( - description='prepare trial list to do attack type verification') - - parser.add_argument('--input-dir', required=True) - parser.add_argument('--known-attacks', required=True, nargs='+') - #parser.add_argument('--benign-wav-file', required=True) - #parser.add_argument('--benign-durs', required=True) - parser.add_argument('--num-enroll-sides', default=1, type=int) - parser.add_argument('--max-trials', default=10, type=float, - help='maximum number of trials in millions') - 
parser.add_argument('--success-category', default='success', - choices=['success', 'fail', 'both']) - parser.add_argument('--min-snr', default=-10, type=float) - parser.add_argument('--max-snr', default=100, type=float) - - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + description="prepare trial list to do attack type verification" + ) + + parser.add_argument("--input-dir", required=True) + parser.add_argument("--known-attacks", required=True, nargs="+") + # parser.add_argument('--benign-wav-file', required=True) + # parser.add_argument('--benign-durs', required=True) + parser.add_argument("--num-enroll-sides", default=1, type=int) + parser.add_argument( + "--max-trials", + default=10, + type=float, + help="maximum number of trials in millions", + ) + parser.add_argument( + "--success-category", default="success", choices=["success", "fail", "both"] + ) + parser.add_argument("--min-snr", default=-10, type=float) + parser.add_argument("--max-snr", default=100, type=float) + + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/local/score_dcf.py b/egs/voxceleb/adv.v2/local/score_dcf.py index 85c82f2b..50babe69 100755 --- a/egs/voxceleb/adv.v2/local/score_dcf.py +++ b/egs/voxceleb/adv.v2/local/score_dcf.py @@ -19,50 +19,57 @@ def score_dcf(key_file, score_file, output_path): - logging.info('Load key: %s' % key_file) + logging.info("Load key: %s" % key_file) key = SparseTrialKey.load_txt(key_file) - logging.info('Load scores: %s' % score_file) + logging.info("Load scores: %s" % score_file) scr = SparseTrialScores.load_txt(score_file) - logging.info('separating tar/non') + logging.info("separating tar/non") tar, non = scr.get_tar_non(key) - logging.info('computing EER/DCF') - priors = np.array([0.001, 0.005, 0.01, 0.05 ]) + logging.info("computing EER/DCF") + priors = np.array([0.001, 0.005, 0.01, 0.05]) min_dcf, act_dcf, eer, _ = fast_eval(tar, non, priors) - + output_dir = os.path.dirname(output_path) if not os.path.isdir(output_dir): os.makedirs(output_dir) - output_file = output_path + '_results' - with open(output_file, 'w') as f: - s = 'EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f} ntar: {9:d} nnon: {10:d}\n'.format( - eer * 100, min_dcf[3], act_dcf[3], - min_dcf[2], act_dcf[2], - min_dcf[1], act_dcf[1], - min_dcf[0], act_dcf[0], - len(tar), len(non)) + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f} ntar: {9:d} nnon: {10:d}\n".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + len(tar), + len(non), + ) f.write(s) logging.info(s) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Computes EER and DCF') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", 
dest="key_file", required=True) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - score_dcf(**vars(args)) - + score_dcf(**vars(args)) diff --git a/egs/voxceleb/adv.v2/local/split_train_test.py b/egs/voxceleb/adv.v2/local/split_train_test.py index 5bd6a304..7af6b019 100755 --- a/egs/voxceleb/adv.v2/local/split_train_test.py +++ b/egs/voxceleb/adv.v2/local/split_train_test.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -18,24 +23,23 @@ from hyperion.utils import Utt2Info -def split_train_test(attack_info_file, train_list, test_list, - p_val, output_dir): +def split_train_test(attack_info_file, train_list, test_list, p_val, output_dir): rng = np.random.RandomState(seed=1234) output_dir = Path(output_dir) output_dir.mkdir(parents=True, exist_ok=True) - with open(attack_info_file, 'r') as f: + with open(attack_info_file, "r") as f: attack_info = yaml.load(f, Loader=yaml.FullLoader) benign_to_attack = {} for k, v in attack_info.items(): - bk = v['key_original'] + bk = v["key_original"] if bk in benign_to_attack: benign_to_attack[bk].append(k) else: benign_to_attack[bk] = [k] - + train_utts = Utt2Info.load(train_list) train_val_keys = train_utts.key # split in train and val @@ -50,13 +54,12 @@ def split_train_test(attack_info_file, train_list, test_list, attacks_k = {} for ak in benign_to_attack[k]: attacks_k[ak] = attack_info[ak] - + p = rng.rand(1) if p < p_val: val_info.update(attacks_k) else: train_info.update(attacks_k) - test_utts = Utt2Info.load(test_list) test_val_keys = test_utts.key @@ -69,31 +72,32 @@ def split_train_test(attack_info_file, train_list, test_list, for ak in benign_to_attack[k]: test_info[ak] = attack_info[ak] - with open(output_dir / 'train_attack_info.yaml', 'w') as f: + with open(output_dir / "train_attack_info.yaml", "w") as f: yaml.dump(train_info, f, sort_keys=True) - with open(output_dir / 'val_attack_info.yaml', 'w') as f: + with open(output_dir / "val_attack_info.yaml", "w") as f: yaml.dump(val_info, f, sort_keys=True) - with open(output_dir / 'test_attack_info.yaml', 'w') as f: + with open(output_dir / "test_attack_info.yaml", "w") as f: yaml.dump(test_info, f, sort_keys=True) - if __name__ == "__main__": parser = ArgumentParser( - description='Split Yaml attacks info file into train/val/test') - - parser.add_argument('--attack-info-file', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--test-list', required=True) - parser.add_argument('--p-val', default=0.1, type=float) - parser.add_argument('--output-dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + 
description="Split Yaml attacks info file into train/val/test" + ) + + parser.add_argument("--attack-info-file", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--test-list", required=True) + parser.add_argument("--p-val", default=0.1, type=float) + parser.add_argument("--output-dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py index 8af7ccd2..4b017114 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-Nvs1-v1.py @@ -21,71 +21,93 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, eval_method, - model_part_idx, num_model_parts, - seg_part_idx, num_seg_parts, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + eval_method, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None - tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc, - model_part_idx, num_model_parts, seg_part_idx, num_seg_parts) + tdr = TDR( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + ) x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_Nvs1(x_e, x_t, ids1=ids_e, method=eval_method) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) if num_model_parts > 1 or num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, model_part_idx, seg_part_idx) - logging.info('saving scores to %s' % (score_file)) + score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA') - - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--eval-method', choices=['book', 'vavg-lnorm'], default='vavg-lnorm') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument( + "--eval-method", choices=["book", "vavg-lnorm"], default="vavg-lnorm" + ) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py index 4f8563f9..0b3c9125 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos-Nvs1.py @@ -22,65 +22,84 @@ from hyperion.transforms import TransformList, LNorm -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, score_file, - model_part_idx, num_model_parts, - seg_part_idx, num_seg_parts, **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + score_file, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None - tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc, - model_part_idx, num_model_parts, seg_part_idx, num_seg_parts) + tdr = TDR( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + ) x_e, x_t, enroll, ndx = tdr.read() enroll, ids_e = np.unique(enroll, return_inverse=True) t1 = time.time() - logging.info('computing 
llr') + logging.info("computing llr") D_e = PLDA.compute_stats_hard(x_e, class_ids=ids_e) - x_e=D_e[1]/np.expand_dims(D_e[0], axis=-1) + x_e = D_e[1] / np.expand_dims(D_e[0], axis=-1) scores = cosine_scoring(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) if num_model_parts > 1 or num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, model_part_idx, seg_part_idx) - logging.info('saving scores to %s' % (score_file)) + score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval cosine-scoring') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval cosine-scoring", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py index 41f0cd1c..0438e373 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-cos.py @@ -6,7 +6,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -20,61 +25,78 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, score_file, - model_part_idx, num_model_parts, - seg_part_idx, num_seg_parts, **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + score_file, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + **kwargs +): + + logging.info("loading 
data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None - tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc, - model_part_idx, num_model_parts, seg_part_idx, num_seg_parts) + tdr = TDR( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + ) x_e, x_t, enroll, ndx = tdr.read() t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = cosine_scoring(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) if num_model_parts > 1 or num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, model_part_idx, seg_part_idx) - logging.info('saving scores to %s' % (score_file)) + score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask) s.save_txt(score_file) - if __name__ == "__main__": - parser=ArgumentParser( - description='Eval cosine-scoring') + parser = ArgumentParser(description="Eval cosine-scoring") - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**namespace_to_dict(args)) - - diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py b/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py index 8263ef33..3ebac1f6 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-be-novelty.py @@ -23,19 +23,25 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF - -def eval_plda(iv_file, ndx_file, enroll_file, - preproc_file, - model_file, score_file, plda_type, eval_method, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + preproc_file, + model_file, + score_file, + plda_type, + eval_method, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None - logging.info('loading data') + logging.info("loading data") if preproc_file is not None: 
preproc = TransformList.load(preproc_file) else: @@ -48,52 +54,55 @@ def eval_plda(iv_file, ndx_file, enroll_file, x_e = reader.read(u2s.key, squeeze=True) enroll, ids_e = np.unique(u2s.info, return_inverse=True) - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") x_e = preproc.predict(x_e) x_t = preproc.predict(x_t) scores = model.llr_Nvs1(x_e, x_t, ids1=ids_e, method=eval_method) - scores = - logsumexp(scores, axis=0, keepdims=True) + len(enroll) - + scores = -logsumexp(scores, axis=0, keepdims=True) + len(enroll) + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) - model_set = ['known'] - logging.info('saving scores to %s' % (score_file)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) + model_set = ["known"] + logging.info("saving scores to %s" % (score_file)) s = TrialScores(model_set, ndx.seg_set, scores) s.save_txt(score_file) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--eval-method', choices=['book', 'vavg-lnorm'], default='vavg-lnorm') + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument( + "--eval-method", choices=["book", "vavg-lnorm"], default="vavg-lnorm" + ) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_plda(**vars(args)) - - diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py index cdf5c7b9..630bc244 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unknown-attacks-noimp.py @@ -11,31 +11,36 @@ import numpy as np import pandas as pd -#import matplotlib -#matplotlib.use('Agg') -#import matplotlib.pyplot as plt + +# import matplotlib +# matplotlib.use('Agg') +# import matplotlib.pyplot as plt from hyperion.hyp_defs import config_logger from hyperion.utils 
import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import compute_confusion_matrix, print_confusion_matrix +from hyperion.metrics.confusion_matrix import ( + compute_confusion_matrix, + print_confusion_matrix, +) from hyperion.transforms import PCA, LNorm from hyperion.pdfs import SPLDA from numpy.linalg import matrix_rank -#colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] -#markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v', 'p', '8'] +# colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] +# markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v', 'p', '8'] -#color_marker = [(c,m) for m in markers for c in colors] +# color_marker = [(c,m) for m in markers for c in colors] def read_class_file(class_file): - - class_info = pd.read_csv(class_file, header=None, sep=' ') + + class_info = pd.read_csv(class_file, header=None, sep=" ") classes = class_info[0] - class2ids = {str(k):i for i,k in enumerate(class_info[0])} - return classes, class2ids + class2ids = {str(k): i for i, k in enumerate(class_info[0])} + return classes, class2ids + def eval_classif_perf(v_file, key_file, class_file, output_path=None, **kwargs): @@ -49,83 +54,84 @@ def eval_classif_perf(v_file, key_file, class_file, output_path=None, **kwargs): del reader class_names = np.asarray(utts.info) - class_names[class_names=='imp'] = 'pgd-linf' + class_names[class_names == "imp"] = "pgd-linf" classes, class_ids = np.unique(class_names, return_inverse=True) - #divide train and test + # divide train and test mask = rng.rand(len(x)) < 0.3 x_train = x[mask] class_ids_train = class_ids[mask] - x_test = x[mask==False] - y_true = class_ids[mask==False] + x_test = x[mask == False] + y_true = class_ids[mask == False] rank = matrix_rank(x_train) # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x_train) x_train = pca.predict(x_train) x_test = pca.predict(x_test) - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_train) x_train = lnorm.predict(x_train) x_test = lnorm.predict(x_test) - plda = SPLDA(y_dim=min(max(class_ids)+1, x_train.shape[1])) + plda = SPLDA(y_dim=min(max(class_ids) + 1, x_train.shape[1])) plda.fit(x_train, class_ids=class_ids_train) - y = plda.llr_Nvs1(x_train, x_test, ids1=class_ids_train, method='book').T + y = plda.llr_Nvs1(x_train, x_test, ids1=class_ids_train, method="book").T y_pred = np.argmax(y, axis=1) acc = compute_accuracy(y_true, y_pred) - logging.info('Classification accuracy %.2f %%' % (acc * 100)) + logging.info("Classification accuracy %.2f %%" % (acc * 100)) labels = np.arange(len(classes), dtype=np.int) C = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=False) - logging.info('Unnormalized Confusion Matrix:') - print_confusion_matrix(C, labels_true = classes) + logging.info("Unnormalized Confusion Matrix:") + print_confusion_matrix(C, labels_true=classes) Cn = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=True) - logging.info('Normalized Confusion Matrix:') - print_confusion_matrix(Cn*100, labels_true = classes, fmt='.1f') + logging.info("Normalized Confusion Matrix:") + print_confusion_matrix(Cn * 100, labels_true=classes, fmt=".1f") - #remove benign class, which has id=0 + # remove benign class, which has id=0 mask = y_true > 0 - y = y[mask,1:] + y = y[mask, 1:] y_true = y_true[mask] - 1 - logging.info('without benign class') + logging.info("without benign 
class") y_pred = np.argmax(y, axis=1) acc = compute_accuracy(y_true, y_pred) - logging.info('Classification accuracy %.2f %%' % (acc * 100)) + logging.info("Classification accuracy %.2f %%" % (acc * 100)) - labels = np.arange(len(classes)-1, dtype=np.int) + labels = np.arange(len(classes) - 1, dtype=np.int) C = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=False) - logging.info('Unnormalized Confusion Matrix:') - print_confusion_matrix(C, labels_true = classes[1:]) + logging.info("Unnormalized Confusion Matrix:") + print_confusion_matrix(C, labels_true=classes[1:]) Cn = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=True) - logging.info('Normalized Confusion Matrix:') - print_confusion_matrix(Cn*100, labels_true = classes[1:], fmt='.1f') + logging.info("Normalized Confusion Matrix:") + print_confusion_matrix(Cn * 100, labels_true=classes[1:], fmt=".1f") - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evaluates attack classification accuracy') + fromfile_prefix_chars="@", + description="Evaluates attack classification accuracy", + ) - parser.add_argument('--v-file', required=True) - parser.add_argument('--key-file', required=True) - parser.add_argument('--class-file', required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--class-file", required=True) - #parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + # parser.add_argument('--output-path', dest='output_path', required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_classif_perf(**vars(args)) + eval_classif_perf(**vars(args)) diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py index 4df15268..5ad87f72 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-plda-unkown-attacks.py @@ -11,31 +11,36 @@ import numpy as np import pandas as pd -#import matplotlib -#matplotlib.use('Agg') -#import matplotlib.pyplot as plt + +# import matplotlib +# matplotlib.use('Agg') +# import matplotlib.pyplot as plt from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import compute_confusion_matrix, print_confusion_matrix +from hyperion.metrics.confusion_matrix import ( + compute_confusion_matrix, + print_confusion_matrix, +) from hyperion.transforms import PCA, LNorm from hyperion.pdfs import SPLDA from numpy.linalg import matrix_rank -#colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] -#markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v', 'p', '8'] +# colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] +# markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v', 'p', '8'] -#color_marker = [(c,m) for m in markers for c in colors] +# color_marker = [(c,m) for m in markers for c in colors] def 
read_class_file(class_file): - - class_info = pd.read_csv(class_file, header=None, sep=' ') + + class_info = pd.read_csv(class_file, header=None, sep=" ") classes = class_info[0] - class2ids = {str(k):i for i,k in enumerate(class_info[0])} - return classes, class2ids + class2ids = {str(k): i for i, k in enumerate(class_info[0])} + return classes, class2ids + def eval_classif_perf(v_file, key_file, class_file, output_path=None, **kwargs): @@ -49,81 +54,82 @@ def eval_classif_perf(v_file, key_file, class_file, output_path=None, **kwargs): del reader classes, class_ids = np.unique(utts.info, return_inverse=True) - #divide train and test + # divide train and test mask = rng.rand(len(x)) < 0.3 x_train = x[mask] class_ids_train = class_ids[mask] - x_test = x[mask==False] - y_true = class_ids[mask==False] + x_test = x[mask == False] + y_true = class_ids[mask == False] rank = matrix_rank(x_train) # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x_train) x_train = pca.predict(x_train) x_test = pca.predict(x_test) - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x_train) x_train = lnorm.predict(x_train) x_test = lnorm.predict(x_test) - plda = SPLDA(y_dim=min(max(class_ids)+1, x_train.shape[1])) + plda = SPLDA(y_dim=min(max(class_ids) + 1, x_train.shape[1])) plda.fit(x_train, class_ids=class_ids_train) - y = plda.llr_Nvs1(x_train, x_test, ids1=class_ids_train, method='book').T + y = plda.llr_Nvs1(x_train, x_test, ids1=class_ids_train, method="book").T y_pred = np.argmax(y, axis=1) acc = compute_accuracy(y_true, y_pred) - logging.info('Classification accuracy %.2f %%' % (acc * 100)) + logging.info("Classification accuracy %.2f %%" % (acc * 100)) labels = np.arange(len(classes), dtype=np.int) C = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=False) - logging.info('Unnormalized Confusion Matrix:') - print_confusion_matrix(C, labels_true = classes) + logging.info("Unnormalized Confusion Matrix:") + print_confusion_matrix(C, labels_true=classes) Cn = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=True) - logging.info('Normalized Confusion Matrix:') - print_confusion_matrix(Cn*100, labels_true = classes, fmt='.1f') + logging.info("Normalized Confusion Matrix:") + print_confusion_matrix(Cn * 100, labels_true=classes, fmt=".1f") - #remove benign class, which has id=0 + # remove benign class, which has id=0 mask = y_true > 0 - y = y[mask,1:] + y = y[mask, 1:] y_true = y_true[mask] - 1 - logging.info('without benign class') + logging.info("without benign class") y_pred = np.argmax(y, axis=1) acc = compute_accuracy(y_true, y_pred) - logging.info('Classification accuracy %.2f %%' % (acc * 100)) + logging.info("Classification accuracy %.2f %%" % (acc * 100)) - labels = np.arange(len(classes)-1, dtype=np.int) + labels = np.arange(len(classes) - 1, dtype=np.int) C = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=False) - logging.info('Unnormalized Confusion Matrix:') - print_confusion_matrix(C, labels_true = classes[1:]) + logging.info("Unnormalized Confusion Matrix:") + print_confusion_matrix(C, labels_true=classes[1:]) Cn = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=True) - logging.info('Normalized Confusion Matrix:') - print_confusion_matrix(Cn*100, labels_true = classes[1:], fmt='.1f') + logging.info("Normalized Confusion Matrix:") + print_confusion_matrix(Cn * 100, labels_true=classes[1:], fmt=".1f") - if __name__ == 
"__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evaluates attack classification accuracy') + fromfile_prefix_chars="@", + description="Evaluates attack classification accuracy", + ) - parser.add_argument('--v-file', required=True) - parser.add_argument('--key-file', required=True) - parser.add_argument('--class-file', required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--class-file", required=True) - #parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + # parser.add_argument('--output-path', dest='output_path', required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_classif_perf(**vars(args)) + eval_classif_perf(**vars(args)) diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py index f6214102..e8dd6e00 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf-unknown-attacks.py @@ -11,28 +11,33 @@ import numpy as np import pandas as pd -#import matplotlib -#matplotlib.use('Agg') -#import matplotlib.pyplot as plt + +# import matplotlib +# matplotlib.use('Agg') +# import matplotlib.pyplot as plt from hyperion.hyp_defs import config_logger from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import compute_xlabel_confusion_matrix, print_confusion_matrix +from hyperion.metrics.confusion_matrix import ( + compute_xlabel_confusion_matrix, + print_confusion_matrix, +) -#colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] -#markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v', 'p', '8'] +# colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] +# markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v', 'p', '8'] -#color_marker = [(c,m) for m in markers for c in colors] +# color_marker = [(c,m) for m in markers for c in colors] def read_class_file(class_file): - - class_info = pd.read_csv(class_file, header=None, sep=' ') + + class_info = pd.read_csv(class_file, header=None, sep=" ") classes = class_info[0] - class2ids = {str(k):i for i,k in enumerate(class_info[0])} - return classes, class2ids + class2ids = {str(k): i for i, k in enumerate(class_info[0])} + return classes, class2ids + def eval_classif_perf(score_file, key_file, class_file, output_path=None, **kwargs): @@ -46,36 +51,44 @@ def eval_classif_perf(score_file, key_file, class_file, output_path=None, **kwar y_pred = np.argmax(y, axis=1) acc = compute_accuracy(y_true, y_pred) - logging.info('Classification accuracy %.2f %%' % (acc * 100)) + logging.info("Classification accuracy %.2f %%" % (acc * 100)) labels_train = np.arange(len(classes), dtype=np.int) - C = compute_xlabel_confusion_matrix(y_true, y_pred, labels_train=labels_train, normalize=False) + C = compute_xlabel_confusion_matrix( + y_true, y_pred, labels_train=labels_train, normalize=False + ) - logging.info('Unnormalized Confusion Matrix:') + 
logging.info("Unnormalized Confusion Matrix:") print_confusion_matrix(C, labels_true=classes_test, labels_pred=classes) - Cn = compute_xlabel_confusion_matrix(y_true, y_pred, labels_train=labels_train, normalize=True) - logging.info('Normalized Confusion Matrix:') - print_confusion_matrix(Cn*100, labels_true=classes_test, labels_pred=classes, fmt='.1f') - + Cn = compute_xlabel_confusion_matrix( + y_true, y_pred, labels_train=labels_train, normalize=True + ) + logging.info("Normalized Confusion Matrix:") + print_confusion_matrix( + Cn * 100, labels_true=classes_test, labels_pred=classes, fmt=".1f" + ) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evaluates attack classification accuracy') + fromfile_prefix_chars="@", + description="Evaluates attack classification accuracy", + ) - parser.add_argument('--score-file', required=True) - parser.add_argument('--key-file', required=True) - parser.add_argument('--class-file', required=True) + parser.add_argument("--score-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--class-file", required=True) - #parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + # parser.add_argument('--output-path', dest='output_path', required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_classif_perf(**vars(args)) + eval_classif_perf(**vars(args)) diff --git a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py index fa890816..6b259a2f 100755 --- a/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py +++ b/egs/voxceleb/adv.v2/steps_backend/eval-classif-perf.py @@ -6,7 +6,12 @@ import logging import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import numpy as np @@ -16,22 +21,21 @@ from hyperion.utils import Utt2Info from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.metrics.acc import compute_accuracy -from hyperion.metrics.confusion_matrix import compute_confusion_matrix, print_confusion_matrix +from hyperion.metrics.confusion_matrix import ( + compute_confusion_matrix, + print_confusion_matrix, +) def read_class_file(class_file): - class_info = pd.read_csv(class_file, header=None, sep=' ') + class_info = pd.read_csv(class_file, header=None, sep=" ") classes = class_info[0] class2ids = {str(k): i for i, k in enumerate(class_info[0])} return classes, class2ids -def eval_classif_perf(score_file, - key_file, - class_file, - output_path=None, - **kwargs): +def eval_classif_perf(score_file, key_file, class_file, output_path=None, **kwargs): utts = Utt2Info.load(key_file) classes, class2ids = read_class_file(class_file) @@ -46,40 +50,30 @@ def eval_classif_perf(score_file, y_pred = np.argmax(y, axis=1) acc = compute_accuracy(y_true, y_pred) - logging.info('Classification accuracy %.2f %%' % (acc * 100)) + logging.info("Classification accuracy %.2f %%" % (acc * 100)) labels = np.arange(len(classes), 
dtype=np.int) - C = compute_confusion_matrix(y_true, - y_pred, - labels=labels, - normalize=False) - logging.info('Unnormalized Confusion Matrix:') + C = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=False) + logging.info("Unnormalized Confusion Matrix:") print_confusion_matrix(C, labels_true=classes) - Cn = compute_confusion_matrix(y_true, - y_pred, - labels=labels, - normalize=True) - logging.info('Normalized Confusion Matrix:') - print_confusion_matrix(Cn * 100, labels_true=classes, fmt='.1f') + Cn = compute_confusion_matrix(y_true, y_pred, labels=labels, normalize=True) + logging.info("Normalized Confusion Matrix:") + print_confusion_matrix(Cn * 100, labels_true=classes, fmt=".1f") if __name__ == "__main__": - parser = ArgumentParser( - description='Evaluates attack classification accuracy') + parser = ArgumentParser(description="Evaluates attack classification accuracy") - parser.add_argument('--score-file', required=True) - parser.add_argument('--key-file', required=True) - parser.add_argument('--class-file', required=True) + parser.add_argument("--score-file", required=True) + parser.add_argument("--key-file", required=True) + parser.add_argument("--class-file", required=True) - #parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', - '--verbose', - dest='verbose', - default=1, - choices=[0, 1, 2, 3], - type=int) + # parser.add_argument('--output-path', dest='output_path', required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py b/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py index 0de842ba..8ba93714 100755 --- a/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py +++ b/egs/voxceleb/adv.v2/steps_backend/train-be-v1.py @@ -18,10 +18,19 @@ from numpy.linalg import matrix_rank -def train_be(iv_file, train_list, - plda_type, y_dim, z_dim, - epochs, ml_md, md_epochs, - output_path, **kwargs): + +def train_be( + iv_file, + train_list, + plda_type, + y_dim, + z_dim, + epochs, + ml_md, + md_epochs, + output_path, + **kwargs +): # Read data vcr_args = VCR.filter_args(**kwargs) @@ -34,30 +43,28 @@ def train_be(iv_file, train_list, pca = None if rank < x.shape[1]: # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x) x = pca.predict(x) if y_dim > rank: y_dim = rank - logging.info('PCA rank=%d' % (rank)) + logging.info("PCA rank=%d" % (rank)) # Train centering and whitening t1 = time.time() - lnorm = LNorm(name='lnorm') + lnorm = LNorm(name="lnorm") lnorm.fit(x) x_ln = lnorm.predict(x) - logging.info('LNorm Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("LNorm Elapsed time: %.2f s." % (time.time() - t1)) + # Train PLDA t1 = time.time() - plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, - name='plda') - elbo = plda.fit(x_ln, class_ids, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + plda = F.create_plda(plda_type, y_dim=y_dim, z_dim=z_dim, name="plda") + elbo = plda.fit(x_ln, class_ids, epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) - logging.info('PLDA Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("PLDA Elapsed time: %.2f s." 
% (time.time() - t1)) # Save models if pca is None: @@ -68,35 +75,36 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/lnorm.h5') - plda.save(output_path + '/plda.h5') + preproc.save(output_path + "/lnorm.h5") + plda.save(output_path + "/plda.h5") num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - np.savetxt(output_path + '/elbo.csv', elbo, delimiter=',') - - + np.savetxt(output_path + "/elbo.csv", elbo, delimiter=",") + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Back-end') + fromfile_prefix_chars="@", + description="Train Back-end", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_be(**vars(args)) - + train_be(**vars(args)) diff --git a/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py b/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py index d3471c2a..03fa3325 100755 --- a/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py +++ b/egs/voxceleb/adv.v2/steps_visual/proj-attack-lda.py @@ -6,13 +6,19 @@ import logging import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import numpy as np import pandas as pd import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt @@ -21,14 +27,15 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.transforms import LDA -colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] -markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v', 'p', '8'] +colors = ["b", "g", "r", "c", "m", "y", "k"] +markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] + +color_marker = [(c, m) for m in markers for c in colors] -color_marker = [(c,m) for m in markers for c in colors] -def proj_attack_lda(train_v_file, train_list, - test_v_file, test_list, - lda_cols, title, output_path): +def proj_attack_lda( + train_v_file, train_list, test_v_file, test_list, lda_cols, title, output_path +): train_utts = Utt2Info.load(train_list) test_utts = Utt2Info.load(test_list) @@ -42,7 +49,7 @@ def proj_attack_lda(train_v_file, train_list, del test_reader lda_cols = np.asarray(lda_cols) - 1 - lda_classes = train_utts.info[:,lda_cols] + lda_classes = train_utts.info[:, lda_cols] if lda_classes.shape[1] > 1: new_classes = [] for i in range(lda_classes.shape[0]): @@ -51,67 +58,75 @@ def proj_attack_lda(train_v_file, train_list, lda_classes, class_ids = np.unique(lda_classes, return_inverse=True) lda = LDA(lda_dim=2) - + lda.fit(x_trn, 
class_ids) x_lda_trn = lda.predict(x_trn) x_lda_test = lda.predict(x_test) p_test = np.random.rand(x_test.shape[0]) < 0.05 x_lda_test = x_lda_test[p_test] for col in range(3): - fig_file = '%s/train_lda_%d.png' % (output_path, col) + fig_file = "%s/train_lda_%d.png" % (output_path, col) classes = train_utts.info[:, col] classes, class_ids = np.unique(classes, return_inverse=True) - for c in range(np.max(class_ids)+1): + for c in range(np.max(class_ids) + 1): idx = class_ids == c - plt.scatter(x_lda_trn[idx,0], x_lda_trn[idx,1], - c=color_marker[c][0], marker=color_marker[c][1], - label=classes[c]) - + plt.scatter( + x_lda_trn[idx, 0], + x_lda_trn[idx, 1], + c=color_marker[c][0], + marker=color_marker[c][1], + label=classes[c], + ) + plt.legend() plt.grid(True) plt.show() - plt.title('Train-set LDA(%s)' % title) + plt.title("Train-set LDA(%s)" % title) plt.savefig(fig_file) plt.clf() - fig_file = '%s/test_lda_%d.png' % (output_path, col) + fig_file = "%s/test_lda_%d.png" % (output_path, col) classes = test_utts.info[p_test, col] classes, class_ids = np.unique(classes, return_inverse=True) - for c in range(np.max(class_ids)+1): + for c in range(np.max(class_ids) + 1): idx = class_ids == c - plt.scatter(x_lda_test[idx,0], x_lda_test[idx,1], - c=color_marker[c][0], marker=color_marker[c][1], - label=classes[c]) - + plt.scatter( + x_lda_test[idx, 0], + x_lda_test[idx, 1], + c=color_marker[c][0], + marker=color_marker[c][1], + label=classes[c], + ) + plt.legend() plt.grid(True) plt.show() - plt.title('Test-set LDA(%s)' % title) + plt.title("Test-set LDA(%s)" % title) plt.savefig(fig_file) plt.clf() if __name__ == "__main__": - parser = ArgumentParser( - description='Proj x-vector with LDA to classify attacks') + parser = ArgumentParser(description="Proj x-vector with LDA to classify attacks") + + parser.add_argument("--train-v-file", required=True) + parser.add_argument("--train-list", required=True) - parser.add_argument('--train-v-file', required=True) - parser.add_argument('--train-list', required=True) + parser.add_argument("--test-v-file", required=True) + parser.add_argument("--test-list", required=True) - parser.add_argument('--test-v-file', required=True) - parser.add_argument('--test-list', required=True) + parser.add_argument("--lda-cols", type=int, nargs="+", required=True) + parser.add_argument("--title", default="") - parser.add_argument('--lda-cols', type=int, nargs='+', required=True) - parser.add_argument('--title', default='') - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - proj_attack_lda(**namespace_to_dict(args)) + proj_attack_lda(**namespace_to_dict(args)) diff --git a/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py b/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py index 8785f1f9..a76a6633 100755 --- a/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py +++ b/egs/voxceleb/adv.v2/steps_visual/proj-attack-tsne.py @@ -6,13 +6,19 @@ import logging import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + 
namespace_to_dict, +) import time import numpy as np import pandas as pd import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt @@ -21,13 +27,15 @@ from hyperion.io import RandomAccessDataReaderFactory as DRF from hyperion.transforms import PCA, SklTSNE, LNorm -colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] -markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v', 'p', '8'] +colors = ["b", "g", "r", "c", "m", "y", "k"] +markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] + +color_marker = [(c, m) for m in markers for c in colors] -color_marker = [(c,m) for m in markers for c in colors] -def proj_attack_tsne(train_v_file, train_list, - pca_var_r, prob_plot, lnorm, title, output_path, **kwargs): +def proj_attack_tsne( + train_v_file, train_list, pca_var_r, prob_plot, lnorm, title, output_path, **kwargs +): train_utts = Utt2Info.load(train_list) @@ -42,26 +50,30 @@ def proj_attack_tsne(train_v_file, train_list, pca = PCA(pca_var_r=pca_var_r) pca.fit(x_trn) x_pca = pca.predict(x_trn) - logging.info('pca-dim={}'.format(x_pca.shape[1])) + logging.info("pca-dim={}".format(x_pca.shape[1])) else: x_pca = x_trn - tsne_args = SklTSNE.filter_args(**kwargs['tsne']) + tsne_args = SklTSNE.filter_args(**kwargs["tsne"]) tsne = SklTSNE(**tsne_args) x_tsne = tsne.fit(x_pca) p = np.random.rand(x_tsne.shape[0]) < prob_plot x_tsne = x_tsne[p] for col in range(1): - fig_file = '%s/train_tsne_%d.png' % (output_path, col) + fig_file = "%s/train_tsne_%d.png" % (output_path, col) classes = train_utts.info[p] classes, class_ids = np.unique(classes, return_inverse=True) - for c in range(np.max(class_ids)+1): + for c in range(np.max(class_ids) + 1): idx = class_ids == c - plt.scatter(x_tsne[idx,0], x_tsne[idx,1], - c=color_marker[c][0], marker=color_marker[c][1], - label=classes[c]) - + plt.scatter( + x_tsne[idx, 0], + x_tsne[idx, 1], + c=color_marker[c][0], + marker=color_marker[c][1], + label=classes[c], + ) + plt.legend() plt.grid(True) plt.title(title) @@ -71,25 +83,25 @@ def proj_attack_tsne(train_v_file, train_list, if __name__ == "__main__": - parser = ArgumentParser( - description='Proj x-vector with TSNE to visualize attacks') + parser = ArgumentParser(description="Proj x-vector with TSNE to visualize attacks") - parser.add_argument('--train-v-file', required=True) - parser.add_argument('--train-list', required=True) + parser.add_argument("--train-v-file", required=True) + parser.add_argument("--train-list", required=True) - parser.add_argument('--pca-var-r', default=0.95, type=float) - parser.add_argument('--prob-plot', default=0.1, type=float) - parser.add_argument('--lnorm', default=False, action='store_true') - parser.add_argument('--title', default='') - SklTSNE.add_class_args(parser, prefix='tsne') - - parser.add_argument('--output-path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument("--pca-var-r", default=0.95, type=float) + parser.add_argument("--prob-plot", default=0.1, type=float) + parser.add_argument("--lnorm", default=False, action="store_true") + parser.add_argument("--title", default="") + SklTSNE.add_class_args(parser, prefix="tsne") - args=parser.parse_args() + parser.add_argument("--output-path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - 
proj_attack_tsne(**namespace_to_dict(args)) + proj_attack_tsne(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/local/attack_analysis.py b/egs/voxceleb/v1/local/attack_analysis.py index 26b41d24..8c74c6e9 100755 --- a/egs/voxceleb/v1/local/attack_analysis.py +++ b/egs/voxceleb/v1/local/attack_analysis.py @@ -15,134 +15,186 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.metrics.verification_evaluator import VerificationAdvAttackEvaluator as Eval +from hyperion.metrics.verification_evaluator import ( + VerificationAdvAttackEvaluator as Eval, +) -def evaluate_attacks(key_file, clean_score_file, attack_score_files, attack_stats_files, - output_path, prior): +def evaluate_attacks( + key_file, + clean_score_file, + attack_score_files, + attack_stats_files, + output_path, + prior, +): output_dir = os.path.dirname(output_path) if not os.path.isdir(output_dir): os.makedirs(output_dir) - evaluator = Eval(key_file, clean_score_file, - attack_score_files, attack_stats_files, prior) + evaluator = Eval( + key_file, clean_score_file, attack_score_files, attack_stats_files, prior + ) # performance vs SNR - logging.info('compute perf vs snr for all trials') + logging.info("compute perf vs snr for all trials") df_clean = evaluator.compute_dcf_eer(return_df=True) - df_clean.insert(0, 'snr', np.inf) + df_clean.insert(0, "snr", np.inf) df = evaluator.compute_dcf_eer_vs_stats( - 'snr', [-10, 0, 10, 20, 30, 40, 50, 60], - 'all', higher_better=True, return_df=True) - file_path = '%s_attack_all_snr_results.csv' % (output_path) + "snr", + [-10, 0, 10, 20, 30, 40, 50, 60], + "all", + higher_better=True, + return_df=True, + ) + file_path = "%s_attack_all_snr_results.csv" % (output_path) df = pd.concat([df_clean, df], ignore_index=True) df.to_csv(file_path) - file_path = '%s_attack_all_snr' % (output_path) + file_path = "%s_attack_all_snr" % (output_path) evaluator.plot_dcf_eer_vs_stat_v1( - df, 'snr', file_path, clean_ref=0, xlabel='SNR(dB)', higher_better=True) + df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True + ) - logging.info('compute perf vs snr for tar trials') + logging.info("compute perf vs snr for tar trials") df = evaluator.compute_dcf_eer_vs_stats( - 'snr', [-10, 0, 10, 20, 30, 40, 50, 60], 'tar', - higher_better=True, return_df=True) - file_path = '%s_attack_tar_snr_results.csv' % (output_path) + "snr", + [-10, 0, 10, 20, 30, 40, 50, 60], + "tar", + higher_better=True, + return_df=True, + ) + file_path = "%s_attack_tar_snr_results.csv" % (output_path) df = pd.concat([df_clean, df], ignore_index=True) df.to_csv(file_path) - file_path = '%s_attack_tar_snr' % (output_path) + file_path = "%s_attack_tar_snr" % (output_path) evaluator.plot_dcf_eer_vs_stat_v1( - df, 'snr', file_path, clean_ref=0, xlabel='SNR(dB)', higher_better=True) + df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True + ) - logging.info('compute perf vs snr for non trials') + logging.info("compute perf vs snr for non trials") df = evaluator.compute_dcf_eer_vs_stats( - 'snr', [-10, 0, 10, 20, 30, 40, 50, 60], 'non', - higher_better=True, return_df=True) - file_path = '%s_attack_non_snr_results.csv' % (output_path) + "snr", + [-10, 0, 10, 20, 30, 40, 50, 60], + "non", + higher_better=True, + return_df=True, + ) + file_path = "%s_attack_non_snr_results.csv" % (output_path) df = pd.concat([df_clean, df], ignore_index=True) df.to_csv(file_path) - file_path = '%s_attack_non_snr' % (output_path) + file_path = "%s_attack_non_snr" % (output_path) 
evaluator.plot_dcf_eer_vs_stat_v1( - df, 'snr', file_path, clean_ref=0, xlabel='SNR(dB)', higher_better=True) + df, "snr", file_path, clean_ref=0, xlabel="SNR(dB)", higher_better=True + ) - logging.info('find best attacks from snr point of view') + logging.info("find best attacks from snr point of view") for i in range(len(attack_score_files)): - file_path = '%s_best_snr_tar_attacks_%d.csv' % (output_path, i) + file_path = "%s_best_snr_tar_attacks_%d.csv" % (output_path, i) evaluator.save_best_attacks( - file_path, 'snr', 'tar', num_best=10, min_delta=1, attack_idx=i, - higher_better=True) - - file_path = '%s_best_snr_non_attacks_%d.csv' % (output_path, i) + file_path, + "snr", + "tar", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=True, + ) + + file_path = "%s_best_snr_non_attacks_%d.csv" % (output_path, i) evaluator.save_best_attacks( - file_path, 'snr', 'non', num_best=10, min_delta=1, attack_idx=i, - higher_better=True) - + file_path, + "snr", + "non", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=True, + ) # performance vs Linf - logging.info('compute perf vs linf for all trials') - eps = np.ceil(np.asarray([0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1])*2**15) + logging.info("compute perf vs linf for all trials") + eps = np.ceil(np.asarray([0, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1]) * 2 ** 15) df = evaluator.compute_dcf_eer_vs_stats( - 'n_linf', eps, 'all', higher_better=False, return_df=True) - file_path = '%s_attack_all_linf_results.csv' % (output_path) + "n_linf", eps, "all", higher_better=False, return_df=True + ) + file_path = "%s_attack_all_linf_results.csv" % (output_path) df.to_csv(file_path) - file_path = '%s_attack_all_linf' % (output_path) + file_path = "%s_attack_all_linf" % (output_path) evaluator.plot_dcf_eer_vs_stat_v1( - df, 'n_linf', file_path, clean_ref=0, xlabel=r'$L_{\infty}$', log_x=True) + df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True + ) - logging.info('compute perf vs linf for tar trials') + logging.info("compute perf vs linf for tar trials") df = evaluator.compute_dcf_eer_vs_stats( - 'n_linf', eps, 'tar', higher_better=False, return_df=True) - file_path = '%s_attack_tar_linf_results.csv' % (output_path) + "n_linf", eps, "tar", higher_better=False, return_df=True + ) + file_path = "%s_attack_tar_linf_results.csv" % (output_path) df.to_csv(file_path) - file_path = '%s_attack_tar_linf' % (output_path) + file_path = "%s_attack_tar_linf" % (output_path) evaluator.plot_dcf_eer_vs_stat_v1( - df, 'n_linf', file_path, clean_ref=0, xlabel=r'$L_{\infty}$', log_x=True) + df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True + ) - logging.info('compute perf vs linf for non trials') + logging.info("compute perf vs linf for non trials") df = evaluator.compute_dcf_eer_vs_stats( - 'n_linf', eps, 'non', higher_better=False, return_df=True) - file_path = '%s_attack_non_linf_results.csv' % (output_path) + "n_linf", eps, "non", higher_better=False, return_df=True + ) + file_path = "%s_attack_non_linf_results.csv" % (output_path) df.to_csv(file_path) - file_path = '%s_attack_non_linf' % (output_path) + file_path = "%s_attack_non_linf" % (output_path) evaluator.plot_dcf_eer_vs_stat_v1( - df, 'n_linf', file_path, clean_ref=0, xlabel=r'$L_{\infty}$', log_x=True) + df, "n_linf", file_path, clean_ref=0, xlabel=r"$L_{\infty}$", log_x=True + ) - #find the best attacks in terms of linf - logging.info('find best attacks from linf point of view') + # find the best attacks in terms of linf + logging.info("find best attacks from linf 
point of view") for i in range(len(attack_score_files)): - file_path = '%s_best_linf_tar_attacks_%d.csv' % (output_path, i) + file_path = "%s_best_linf_tar_attacks_%d.csv" % (output_path, i) evaluator.save_best_attacks( - file_path, 'n_linf', 'tar', num_best=10, min_delta=1, attack_idx=i, - higher_better=False) - - file_path = '%s_best_linf_non_attacks_%d.csv' % (output_path, i) + file_path, + "n_linf", + "tar", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=False, + ) + + file_path = "%s_best_linf_non_attacks_%d.csv" % (output_path, i) evaluator.save_best_attacks( - file_path, 'n_linf', 'non', num_best=10, min_delta=1, attack_idx=i, - higher_better=False) + file_path, + "n_linf", + "non", + num_best=10, + min_delta=1, + attack_idx=i, + higher_better=False, + ) - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Analyses performance of adversarial attacks for spk. verif.') - - parser.add_argument('--key-file', required=True) - parser.add_argument('--clean-score-file', required=True) - parser.add_argument('--attack-score-files', required=True, nargs='+') - parser.add_argument('--attack-stats-files', required=True, nargs='+') - parser.add_argument('--output-path', required=True) - parser.add_argument('--prior', default=0.05, type=float) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Analyses performance of adversarial attacks for spk. verif.", + ) + + parser.add_argument("--key-file", required=True) + parser.add_argument("--clean-score-file", required=True) + parser.add_argument("--attack-score-files", required=True, nargs="+") + parser.add_argument("--attack-stats-files", required=True, nargs="+") + parser.add_argument("--output-path", required=True) + parser.add_argument("--prior", default=0.05, type=float) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - evaluate_attacks(**vars(args)) - + evaluate_attacks(**vars(args)) diff --git a/egs/voxceleb/v1/local/make_musan.py b/egs/voxceleb/v1/local/make_musan.py index 8a93a18b..b0ae6846 100755 --- a/egs/voxceleb/v1/local/make_musan.py +++ b/egs/voxceleb/v1/local/make_musan.py @@ -7,125 +7,183 @@ import os, sys + def process_music_annotations(path): - utt2spk = {} - utt2vocals = {} - lines = open(path, 'r').readlines() - for line in lines: - utt, genres, vocals, musician = line.rstrip().split()[:4] - # For this application, the musican ID isn't important - utt2spk[utt] = utt - utt2vocals[utt] = vocals == "Y" - return utt2spk, utt2vocals + utt2spk = {} + utt2vocals = {} + lines = open(path, "r").readlines() + for line in lines: + utt, genres, vocals, musician = line.rstrip().split()[:4] + # For this application, the musican ID isn't important + utt2spk[utt] = utt + utt2vocals[utt] = vocals == "Y" + return utt2spk, utt2vocals def prepare_music(root_dir, fs, use_vocals): - utt2vocals = {} - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - music_dir = os.path.join(root_dir, "music") - for root, dirs, files in os.walk(music_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = 
str(file).replace(".wav", "") - utt2wav[utt] = file_path - elif str(file) == "ANNOTATIONS": - utt2spk_part, utt2vocals_part = process_music_annotations(file_path) - utt2spk.update(utt2spk_part) - utt2vocals.update(utt2vocals_part) - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2vocals: - if utt in utt2wav: - if use_vocals or not utt2vocals[utt]: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" + utt2vocals = {} + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + music_dir = os.path.join(root_dir, "music") + for root, dirs, files in os.walk(music_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + elif str(file) == "ANNOTATIONS": + utt2spk_part, utt2vocals_part = process_music_annotations(file_path) + utt2spk.update(utt2spk_part) + utt2vocals.update(utt2vocals_part) + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2vocals: + if utt in utt2wav: + if use_vocals or not utt2vocals[utt]: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In music directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + print("Missing file", utt) + num_bad_files += 1 + print( + "In music directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_speech(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - speech_dir = os.path.join(root_dir, "speech") - for root, dirs, files in os.walk(speech_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In speech directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + speech_dir = os.path.join(root_dir, "speech") + for root, dirs, files in os.walk(speech_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + 
utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In speech directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def prepare_noise(root_dir, fs): - utt2spk = {} - utt2wav = {} - num_good_files = 0 - num_bad_files = 0 - noise_dir = os.path.join(root_dir, "noise") - for root, dirs, files in os.walk(noise_dir): - for file in files: - file_path = os.path.join(root, file) - if file.endswith(".wav"): - utt = str(file).replace(".wav", "") - utt2wav[utt] = file_path - utt2spk[utt] = utt - utt2spk_str = "" - utt2wav_str = "" - for utt in utt2spk: - if utt in utt2wav: - utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" - if fs == 8: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 8k -t wav - |\n" - else: - utt2wav_str = utt2wav_str + utt + " sox -t wav " + utt2wav[utt] + " -r 16k -t wav - |\n" - num_good_files += 1 - else: - print("Missing file", utt) - num_bad_files += 1 - print("In noise directory, processed", num_good_files, "files;", num_bad_files, "had missing wav data") - return utt2spk_str, utt2wav_str + utt2spk = {} + utt2wav = {} + num_good_files = 0 + num_bad_files = 0 + noise_dir = os.path.join(root_dir, "noise") + for root, dirs, files in os.walk(noise_dir): + for file in files: + file_path = os.path.join(root, file) + if file.endswith(".wav"): + utt = str(file).replace(".wav", "") + utt2wav[utt] = file_path + utt2spk[utt] = utt + utt2spk_str = "" + utt2wav_str = "" + for utt in utt2spk: + if utt in utt2wav: + utt2spk_str = utt2spk_str + utt + " " + utt2spk[utt] + "\n" + if fs == 8: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 8k -t wav - |\n" + ) + else: + utt2wav_str = ( + utt2wav_str + + utt + + " sox -t wav " + + utt2wav[utt] + + " -r 16k -t wav - |\n" + ) + num_good_files += 1 + else: + print("Missing file", utt) + num_bad_files += 1 + print( + "In noise directory, processed", + num_good_files, + "files;", + num_bad_files, + "had missing wav data", + ) + return utt2spk_str, utt2wav_str + def main(): - in_dir = sys.argv[1] - fs = int(sys.argv[2]) - out_dir = sys.argv[3] - use_vocals = sys.argv[4] == "Y" - utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) - utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) - utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) - utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise - utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise - wav_fi = open(os.path.join(out_dir, "wav.scp"), 'w') - wav_fi.write(utt2wav) - utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), 'w') - utt2spk_fi.write(utt2spk) + in_dir = sys.argv[1] + fs = int(sys.argv[2]) + out_dir = sys.argv[3] + use_vocals = sys.argv[4] == "Y" + utt2spk_music, utt2wav_music = prepare_music(in_dir, fs, use_vocals) + utt2spk_speech, utt2wav_speech = prepare_speech(in_dir, fs) + utt2spk_noise, utt2wav_noise = prepare_noise(in_dir, fs) + utt2spk = utt2spk_speech + utt2spk_music + utt2spk_noise + utt2wav = utt2wav_speech + utt2wav_music + utt2wav_noise + wav_fi = open(os.path.join(out_dir, "wav.scp"), "w") + wav_fi.write(utt2wav) + utt2spk_fi = open(os.path.join(out_dir, "utt2spk"), "w") + utt2spk_fi.write(utt2spk) -if __name__=="__main__": - main() +if __name__ == "__main__": + main() diff --git a/egs/voxceleb/v1/local/make_some_figs.py 
b/egs/voxceleb/v1/local/make_some_figs.py index 0f03919c..207cab20 100755 --- a/egs/voxceleb/v1/local/make_some_figs.py +++ b/egs/voxceleb/v1/local/make_some_figs.py @@ -9,258 +9,428 @@ import pandas as pd from hyperion.hyp_defs import float_cpu, config_logger -from hyperion.metrics.verification_evaluator import VerificationAdvAttackEvaluator as Eval - -filenames = ['voxceleb1_attack_tar_snr_results.csv', - 'voxceleb1_attack_non_snr_results.csv', - 'voxceleb1_attack_tar_linf_results.csv', - 'voxceleb1_attack_non_linf_results.csv'] - -output_dir='exp/figs/resnet34_1/' -base_res_dir = 'exp/scores/' - - - -def plot_figs1(res_dirs1, legends, title_base, fig_base, fmt=['b','r','g','m','c','y'], clean_ref=0): +from hyperion.metrics.verification_evaluator import ( + VerificationAdvAttackEvaluator as Eval, +) + +filenames = [ + "voxceleb1_attack_tar_snr_results.csv", + "voxceleb1_attack_non_snr_results.csv", + "voxceleb1_attack_tar_linf_results.csv", + "voxceleb1_attack_non_linf_results.csv", +] + +output_dir = "exp/figs/resnet34_1/" +base_res_dir = "exp/scores/" + + +def plot_figs1( + res_dirs1, + legends, + title_base, + fig_base, + fmt=["b", "r", "g", "m", "c", "y"], + clean_ref=0, +): df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[0]) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[0]) df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_tar_snr' - Eval.plot_dcf_eer_vs_stat_v2(df, 'snr', fig_file, clean_ref=clean_ref, - xlabel='SNR(dB)', higher_better=True, - legends= legends, fmt=fmt, - title=title_base + ' attacks on target trials', - font_size=13) + fig_file = output_dir + fig_base + "_tar_snr" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "snr", + fig_file, + clean_ref=clean_ref, + xlabel="SNR(dB)", + higher_better=True, + legends=legends, + fmt=fmt, + title=title_base + " attacks on target trials", + font_size=13, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[1]) - df_i = pd.read_csv(file_path,index_col=0) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[1]) + df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_non_snr' - Eval.plot_dcf_eer_vs_stat_v2(df, 'snr', fig_file, clean_ref=clean_ref, - xlabel='SNR(dB)', higher_better=True, - legends=legends, fmt=fmt, - title=title_base + ' attacks on non-target trials', - font_size=13) - + fig_file = output_dir + fig_base + "_non_snr" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "snr", + fig_file, + clean_ref=clean_ref, + xlabel="SNR(dB)", + higher_better=True, + legends=legends, + fmt=fmt, + title=title_base + " attacks on non-target trials", + font_size=13, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[2]) - df_i = pd.read_csv(file_path,index_col=0) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[2]) + df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_tar_linf' - Eval.plot_dcf_eer_vs_stat_v2(df, 'n_linf', fig_file, clean_ref=clean_ref, - xlabel=r'$L_{\infty}$', log_x=True, - legends=legends, fmt=fmt, - title=title_base + ' attacks on target trials', - font_size=13) + fig_file = output_dir + fig_base + "_tar_linf" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "n_linf", + fig_file, + clean_ref=clean_ref, + xlabel=r"$L_{\infty}$", + log_x=True, + legends=legends, + fmt=fmt, + title=title_base + " 
attacks on target trials", + font_size=13, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[3]) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[3]) df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_non_linf' - Eval.plot_dcf_eer_vs_stat_v2(df, 'n_linf', fig_file, clean_ref=clean_ref, - xlabel=r'$L_{\infty}$', log_x=True, - legends=legends, fmt=fmt, - title=title_base + ' attacks on non-target trials', - font_size=13) - - -def plot_figs2(res_dirs1, legends, title_base, fig_base, fmt=['b','r','g','m','c','y'], clean_ref=0, colors=None): + fig_file = output_dir + fig_base + "_non_linf" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "n_linf", + fig_file, + clean_ref=clean_ref, + xlabel=r"$L_{\infty}$", + log_x=True, + legends=legends, + fmt=fmt, + title=title_base + " attacks on non-target trials", + font_size=13, + ) + + +def plot_figs2( + res_dirs1, + legends, + title_base, + fig_base, + fmt=["b", "r", "g", "m", "c", "y"], + clean_ref=0, + colors=None, +): df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[0]) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[0]) df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_tar_snr' - Eval.plot_dcf_eer_vs_stat_v2(df, 'snr', fig_file, clean_ref=clean_ref, - xlabel='SNR(dB)', higher_better=True, - legends= legends, fmt=fmt, - title=title_base + ' Adv. Evasion', - font_size=13, colors=colors) + fig_file = output_dir + fig_base + "_tar_snr" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "snr", + fig_file, + clean_ref=clean_ref, + xlabel="SNR(dB)", + higher_better=True, + legends=legends, + fmt=fmt, + title=title_base + " Adv. Evasion", + font_size=13, + colors=colors, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[1]) - df_i = pd.read_csv(file_path,index_col=0) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[1]) + df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_non_snr' - Eval.plot_dcf_eer_vs_stat_v2(df, 'snr', fig_file, clean_ref=clean_ref, - xlabel='SNR(dB)', higher_better=True, - legends=legends, fmt=fmt, - title=title_base + ' Adv. Impersonation', - font_size=13, colors=colors) - + fig_file = output_dir + fig_base + "_non_snr" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "snr", + fig_file, + clean_ref=clean_ref, + xlabel="SNR(dB)", + higher_better=True, + legends=legends, + fmt=fmt, + title=title_base + " Adv. Impersonation", + font_size=13, + colors=colors, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[2]) - df_i = pd.read_csv(file_path,index_col=0) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[2]) + df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_tar_linf' - Eval.plot_dcf_eer_vs_stat_v2(df, 'n_linf', fig_file, clean_ref=clean_ref, - xlabel=r'$L_{\infty}$', log_x=True, - legends=legends, fmt=fmt, - title=title_base + ' Adv. Evasion', - font_size=13) + fig_file = output_dir + fig_base + "_tar_linf" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "n_linf", + fig_file, + clean_ref=clean_ref, + xlabel=r"$L_{\infty}$", + log_x=True, + legends=legends, + fmt=fmt, + title=title_base + " Adv. 
Evasion", + font_size=13, + ) df = [] for i in range(len(res_dirs1)): - file_path='%s/%s/%s' %(base_res_dir, res_dirs1[i], filenames[3]) + file_path = "%s/%s/%s" % (base_res_dir, res_dirs1[i], filenames[3]) df_i = pd.read_csv(file_path, index_col=0) df.append(df_i) - fig_file = output_dir + fig_base + '_non_linf' - Eval.plot_dcf_eer_vs_stat_v2(df, 'n_linf', fig_file, clean_ref=clean_ref, - xlabel=r'$L_{\infty}$', log_x=True, - legends=legends, fmt=fmt, - title=title_base + ' Adv. Impersonation', - font_size=13) - + fig_file = output_dir + fig_base + "_non_linf" + Eval.plot_dcf_eer_vs_stat_v2( + df, + "n_linf", + fig_file, + clean_ref=clean_ref, + xlabel=r"$L_{\infty}$", + log_x=True, + legends=legends, + fmt=fmt, + title=title_base + " Adv. Impersonation", + font_size=13, + ) if __name__ == "__main__": - if not os.path.isdir(output_dir): os.makedirs(output_dir) - res_dirs0= 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2' - res_dirs1 = ['cosine_fgsm_eall', 'cosine_randfgsm_eall', 'cosine_iterfgsm_eall'] - res_dirs1 = [res_dirs0 + '/' + s for s in res_dirs1] - legends = ['FGSM', 'Rand-FGSM', 'Iter-FGSM'] - plot_figs1(res_dirs1, legends, 'FGSM', 'fgsm') - plot_figs2(res_dirs1, legends, 'FGSM', 'fgsm2') - plot_figs2(res_dirs1, None, 'FGSM', 'fgsmnoleg2') - - res_dirs1 = ['cosine_cwl2_conf0', 'cosine_cwl2_conf1', 'cosine_cwlinf_conf0', 'cosine_cwlinf_conf1'] - res_dirs1 = [res_dirs0 + '/' + s for s in res_dirs1] - legends = ['CW-L2 conf=0', 'CW-L2 conf=1', 'CW-Linf conf=0', 'CW-Linf conf=1'] - plot_figs1(res_dirs1, legends, 'Carlini-Wagner', 'cw') - - + res_dirs0 = "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2" + res_dirs1 = ["cosine_fgsm_eall", "cosine_randfgsm_eall", "cosine_iterfgsm_eall"] + res_dirs1 = [res_dirs0 + "/" + s for s in res_dirs1] + legends = ["FGSM", "Rand-FGSM", "Iter-FGSM"] + plot_figs1(res_dirs1, legends, "FGSM", "fgsm") + plot_figs2(res_dirs1, legends, "FGSM", "fgsm2") + plot_figs2(res_dirs1, None, "FGSM", "fgsmnoleg2") + + res_dirs1 = [ + "cosine_cwl2_conf0", + "cosine_cwl2_conf1", + "cosine_cwlinf_conf0", + "cosine_cwlinf_conf1", + ] + res_dirs1 = [res_dirs0 + "/" + s for s in res_dirs1] + legends = ["CW-L2 conf=0", "CW-L2 conf=1", "CW-Linf conf=0", "CW-Linf conf=1"] + plot_figs1(res_dirs1, legends, "Carlini-Wagner", "cw") ########################### - - res_dirs2 = ['resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2', - 'lresnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2', - 'resetdnn_nl5ld512_e256_arcs30m0.3_do0.1_adam_lr0.05_b512_amp.v2'] - legends = ['ResNet34', 'ThinResNet34', 'ResETDNN'] - res_dirs3 = [s + '/cosine_iterfgsm_eall' for s in res_dirs2] - plot_figs1(res_dirs3, legends, 'Iter-FGSM', 'iterfgsm', clean_ref=None) - plot_figs2(res_dirs3, legends, 'Iter-FGSM', 'iterfgsm2', clean_ref=None) - plot_figs2(res_dirs3, None, 'Iter-FGSM', 'iterfgsmnoleg2', clean_ref=None) - - res_dirs3 = [s + '/cosine_cwl2_conf0' for s in res_dirs2] - plot_figs1(res_dirs3, legends, 'Carlini-Wagner L2', 'cwl2', clean_ref=None) - plot_figs2(res_dirs3, legends, 'Carlini-Wagner L2', 'cwl22', clean_ref=None) + res_dirs2 = [ + "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2", + "lresnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2", + "resetdnn_nl5ld512_e256_arcs30m0.3_do0.1_adam_lr0.05_b512_amp.v2", + ] + legends = ["ResNet34", "ThinResNet34", "ResETDNN"] + res_dirs3 = [s + "/cosine_iterfgsm_eall" for s in res_dirs2] + plot_figs1(res_dirs3, legends, "Iter-FGSM", "iterfgsm", clean_ref=None) + plot_figs2(res_dirs3, legends, "Iter-FGSM", "iterfgsm2", clean_ref=None) + 
plot_figs2(res_dirs3, None, "Iter-FGSM", "iterfgsmnoleg2", clean_ref=None) + + res_dirs3 = [s + "/cosine_cwl2_conf0" for s in res_dirs2] + plot_figs1(res_dirs3, legends, "Carlini-Wagner L2", "cwl2", clean_ref=None) + plot_figs2(res_dirs3, legends, "Carlini-Wagner L2", "cwl22", clean_ref=None) ########################### - - res_dirs1 = ['cosine_cwl2_conf0', 'cosine_cwl2_conf0_noabort', 'cosine_cwl2_conf0_lr0.001', 'cosine_cwl2_conf0_lr0.001_noabort', - 'cosine_cwl2_conf0_lr0.001_noabort_it20', 'cosine_cwl2_conf0_lr0.001_noabort_it40', 'cosine_cwl2_conf0_lr0.001_noabort_it80', - 'cosine_cwl2_conf0_lr0.001_it80'] - legends = ['default', 'lr=0.01 it10', 'lr=0.001 it10 abort early', 'lr=0.001 it10', 'lr=0.001 it20', 'lr=0.001 it40', 'lr=0.001 it80', - 'lr=0.001 it80 abort early'] - fmt=['b', 'r', 'g', 'm','c','y','*b','*r', '*g', '*m', '*c', '*y'] - - res_dirs2 = [res_dirs0 + '/' + s for s in res_dirs1] - - plot_figs1(res_dirs2, legends, 'Carlini-Wagner L2', 'cwl2_iters1', fmt=fmt) - - res_dirs1 = ['cosine_cwl2_conf0', 'cosine_cwl2_conf0_lr0.001_noabort', - 'cosine_cwl2_conf0_lr0.001_noabort_it20', 'cosine_cwl2_conf0_lr0.001_noabort_it40', 'cosine_cwl2_conf0_lr0.001_noabort_it80', - 'cosine_cwl2_conf0_lr0.001_it80'] - legends = ['default', 'lr=0.001 it10', 'lr=0.001 it20', 'lr=0.001 it40', 'lr=0.001 it80', - 'lr=0.001 it80 abort early'] - fmt=['b', 'r','g','m','c','y'] - - res_dirs2 = [res_dirs0 + '/' + s for s in res_dirs1] - - plot_figs1(res_dirs2, legends, 'Carlini-Wagner L2', 'cwl2_iters2', fmt=fmt) + res_dirs1 = [ + "cosine_cwl2_conf0", + "cosine_cwl2_conf0_noabort", + "cosine_cwl2_conf0_lr0.001", + "cosine_cwl2_conf0_lr0.001_noabort", + "cosine_cwl2_conf0_lr0.001_noabort_it20", + "cosine_cwl2_conf0_lr0.001_noabort_it40", + "cosine_cwl2_conf0_lr0.001_noabort_it80", + "cosine_cwl2_conf0_lr0.001_it80", + ] + legends = [ + "default", + "lr=0.01 it10", + "lr=0.001 it10 abort early", + "lr=0.001 it10", + "lr=0.001 it20", + "lr=0.001 it40", + "lr=0.001 it80", + "lr=0.001 it80 abort early", + ] + fmt = ["b", "r", "g", "m", "c", "y", "*b", "*r", "*g", "*m", "*c", "*y"] + + res_dirs2 = [res_dirs0 + "/" + s for s in res_dirs1] + + plot_figs1(res_dirs2, legends, "Carlini-Wagner L2", "cwl2_iters1", fmt=fmt) + + res_dirs1 = [ + "cosine_cwl2_conf0", + "cosine_cwl2_conf0_lr0.001_noabort", + "cosine_cwl2_conf0_lr0.001_noabort_it20", + "cosine_cwl2_conf0_lr0.001_noabort_it40", + "cosine_cwl2_conf0_lr0.001_noabort_it80", + "cosine_cwl2_conf0_lr0.001_it80", + ] + legends = [ + "default", + "lr=0.001 it10", + "lr=0.001 it20", + "lr=0.001 it40", + "lr=0.001 it80", + "lr=0.001 it80 abort early", + ] + fmt = ["b", "r", "g", "m", "c", "y"] + + res_dirs2 = [res_dirs0 + "/" + s for s in res_dirs1] + + plot_figs1(res_dirs2, legends, "Carlini-Wagner L2", "cwl2_iters2", fmt=fmt) ########################### - - res_dirs0= 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2' - res_dirs1 = ['cosine_fgsm_eall', 'cosine_randfgsm_eall', 'cosine_iterfgsm_eall', - 'cosine_cwl2_conf0_lr0.001_noabort', 'cosine_cwsnr_conf0_lr0.001_noabort_it10', - 'cosine_cwrms_conf0_lr0.001_noabort_it10', 'cosine_cwrms_conf4_lr0.001_noabort_it10', 'cosine_cwl2_conf0_lr0.001_noabort_it40'] - res_dirs1 = [res_dirs0 + '/' + s for s in res_dirs1] - fmt=['ob', 'vr','^g','>y','sm','pc', 'Pc', '*r', '+g', 'Dc', 'Hm'] - fmt=['ob', 'vr','^g','>y','sm','pc', 'P', '*', '+g', 'Dc', 'Hm'] - colors=['b', 'r','g','y','m','c', 'lime', 'orange', '+g', 'Dc', 'Hm'] - legends = ['FGSM', 'Rand-FGSM', 'Iter-FGSM', 'CW-L2 k=0', 'CW-SNR k=0', - 'CW-RMS k=0', 'CW-RMS 
k=4', 'CW-RMS k=0 it=40'] - legends = ['FGSM', 'Rand-FGSM', 'Iter-FGSM', 'CW-L2', 'CW-SNR', - 'CW-RMS', 'CW-RMS k=4', 'CW-RMS it=40'] - - plot_figs1(res_dirs1, legends, '', 'fgsmcw', fmt=fmt) - plot_figs1(res_dirs1, None, '', 'fgsmcwnoleg', fmt=fmt) - plot_figs2(res_dirs1, legends, '', 'fgsmcw2', fmt=fmt, colors=colors) - plot_figs2(res_dirs1, None, '', 'fgsmcwnoleg2', fmt=fmt, colors=colors) - - - res_dirs0= 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2' - res_dirs1 = ['cosine_iterfgsm_eall', - 'cosine_cwl2_conf0_lr0.001_noabort', 'cosine_cwsnr_conf0_lr0.001_noabort_it10', - 'cosine_cwrms_conf0_lr0.001_noabort_it10', 'cosine_cwrms_conf4_lr0.001_noabort_it10', 'cosine_cwl2_conf0_lr0.001_noabort_it40'] - res_dirs1 = [res_dirs0 + '/' + s for s in res_dirs1] - fmt=['ob', 'vr','^g','>y','sm','pc', 'Pc', '*r', '+g', 'Dc', 'Hm'] - fmt=['ob', 'vr','^g','>y','sm','pc', 'P', '*', '+g', 'Dc', 'Hm'] - colors=['b', 'r','g','y','m','c', 'lime', 'orange', '+g', 'Dc', 'Hm'] - legends = ['Iter-FGSM', 'CW-L2', 'CW-SNR', - 'CW-RMS', 'CW-RMS k=4', 'CW-RMS it=40'] - - plot_figs2(res_dirs1, legends, '', 'fgsmcw3', fmt=fmt, colors=colors) - plot_figs2(res_dirs1, None, '', 'fgsmcwnoleg3', fmt=fmt, colors=colors) - + res_dirs0 = "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2" + res_dirs1 = [ + "cosine_fgsm_eall", + "cosine_randfgsm_eall", + "cosine_iterfgsm_eall", + "cosine_cwl2_conf0_lr0.001_noabort", + "cosine_cwsnr_conf0_lr0.001_noabort_it10", + "cosine_cwrms_conf0_lr0.001_noabort_it10", + "cosine_cwrms_conf4_lr0.001_noabort_it10", + "cosine_cwl2_conf0_lr0.001_noabort_it40", + ] + res_dirs1 = [res_dirs0 + "/" + s for s in res_dirs1] + fmt = ["ob", "vr", "^g", ">y", "sm", "pc", "Pc", "*r", "+g", "Dc", "Hm"] + fmt = ["ob", "vr", "^g", ">y", "sm", "pc", "P", "*", "+g", "Dc", "Hm"] + colors = ["b", "r", "g", "y", "m", "c", "lime", "orange", "+g", "Dc", "Hm"] + legends = [ + "FGSM", + "Rand-FGSM", + "Iter-FGSM", + "CW-L2 k=0", + "CW-SNR k=0", + "CW-RMS k=0", + "CW-RMS k=4", + "CW-RMS k=0 it=40", + ] + legends = [ + "FGSM", + "Rand-FGSM", + "Iter-FGSM", + "CW-L2", + "CW-SNR", + "CW-RMS", + "CW-RMS k=4", + "CW-RMS it=40", + ] + + plot_figs1(res_dirs1, legends, "", "fgsmcw", fmt=fmt) + plot_figs1(res_dirs1, None, "", "fgsmcwnoleg", fmt=fmt) + plot_figs2(res_dirs1, legends, "", "fgsmcw2", fmt=fmt, colors=colors) + plot_figs2(res_dirs1, None, "", "fgsmcwnoleg2", fmt=fmt, colors=colors) + + res_dirs0 = "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2" + res_dirs1 = [ + "cosine_iterfgsm_eall", + "cosine_cwl2_conf0_lr0.001_noabort", + "cosine_cwsnr_conf0_lr0.001_noabort_it10", + "cosine_cwrms_conf0_lr0.001_noabort_it10", + "cosine_cwrms_conf4_lr0.001_noabort_it10", + "cosine_cwl2_conf0_lr0.001_noabort_it40", + ] + res_dirs1 = [res_dirs0 + "/" + s for s in res_dirs1] + fmt = ["ob", "vr", "^g", ">y", "sm", "pc", "Pc", "*r", "+g", "Dc", "Hm"] + fmt = ["ob", "vr", "^g", ">y", "sm", "pc", "P", "*", "+g", "Dc", "Hm"] + colors = ["b", "r", "g", "y", "m", "c", "lime", "orange", "+g", "Dc", "Hm"] + legends = ["Iter-FGSM", "CW-L2", "CW-SNR", "CW-RMS", "CW-RMS k=4", "CW-RMS it=40"] + + plot_figs2(res_dirs1, legends, "", "fgsmcw3", fmt=fmt, colors=colors) + plot_figs2(res_dirs1, None, "", "fgsmcwnoleg3", fmt=fmt, colors=colors) ########################### - res_dirs1 = ['cosine_iterfgsm_eall', 'cosine_iterfgsm_eall_randsmooth0.001', 'cosine_iterfgsm_eall_randsmooth0.01'] - legends = ['no-def', '$\sigma=32$', '$\sigma=327$'] - fmt=['b', 'r','g','m','c','y'] - - res_dirs2 = [res_dirs0 + '/' + s for s in res_dirs1] - - 
plot_figs1(res_dirs2, legends, 'IterFGSM RandSmooth', 'iterfgsm_randsmooth', fmt=fmt) - plot_figs2(res_dirs2, legends, 'IterFGSM RandSmooth', 'iterfgsm_randsmooth2', fmt=fmt) - plot_figs2(res_dirs2, None, 'IterFGSM RandSmooth', 'iterfgsm_randsmoothnoleg2', fmt=fmt) + res_dirs1 = [ + "cosine_iterfgsm_eall", + "cosine_iterfgsm_eall_randsmooth0.001", + "cosine_iterfgsm_eall_randsmooth0.01", + ] + legends = ["no-def", "$\sigma=32$", "$\sigma=327$"] + fmt = ["b", "r", "g", "m", "c", "y"] + + res_dirs2 = [res_dirs0 + "/" + s for s in res_dirs1] + + plot_figs1( + res_dirs2, legends, "IterFGSM RandSmooth", "iterfgsm_randsmooth", fmt=fmt + ) + plot_figs2( + res_dirs2, legends, "IterFGSM RandSmooth", "iterfgsm_randsmooth2", fmt=fmt + ) + plot_figs2( + res_dirs2, None, "IterFGSM RandSmooth", "iterfgsm_randsmoothnoleg2", fmt=fmt + ) ########################### - res_dirs2 = ['resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2', - 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2.advft_400_400_sgdcos_lr0.005_b256_attack_p0.5eps1step0.125_amp.v1_ep5', - 'resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2.advft_400_400_sgdcos_lr0.005_b256_attack_p0.5eps1step0.125_amp.v1'] - legends = ['No-adv', 'Adv. epoch=5', 'Adv. epoch=23'] - res_dirs3 = [s + '/cosine_fgsm_eall' for s in res_dirs2] - plot_figs1(res_dirs3, legends, 'FGSM adv. finetuning', 'fgsm_advft', clean_ref=None) + res_dirs2 = [ + "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2", + "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2.advft_400_400_sgdcos_lr0.005_b256_attack_p0.5eps1step0.125_amp.v1_ep5", + "resnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2.advft_400_400_sgdcos_lr0.005_b256_attack_p0.5eps1step0.125_amp.v1", + ] + legends = ["No-adv", "Adv. epoch=5", "Adv. epoch=23"] + res_dirs3 = [s + "/cosine_fgsm_eall" for s in res_dirs2] + plot_figs1(res_dirs3, legends, "FGSM adv. 
finetuning", "fgsm_advft", clean_ref=None) ########################### - res_dirs1 = ['cosine_fgsm_eall', 'cosine_randfgsm_eall', 'cosine_iterfgsm_eall', - 'cosine_cwl2_conf0', 'cosine_cwl2_conf1', 'cosine_cwlinf_conf0', 'cosine_cwlinf_conf1'] - names = ['FGSM', 'Rand-FGSM', 'Iter-FGSM', - 'CW-L2 conf=0', 'CW-L2 conf=1', 'CW-Linf conf=0', 'CW-Linf conf=1'] - fig_names = ['fgsm', 'randfgsm', 'iterfgsm', 'cwl2_conf0', 'cwl2_conf1', 'cwlinf_conf0', 'cwlinf_conf1'] - legends = ['ResNet34 (white-box)', 'ThinResNet34', 'ResETDNN'] - fmt=['b','r','g','m','c','y'] + res_dirs1 = [ + "cosine_fgsm_eall", + "cosine_randfgsm_eall", + "cosine_iterfgsm_eall", + "cosine_cwl2_conf0", + "cosine_cwl2_conf1", + "cosine_cwlinf_conf0", + "cosine_cwlinf_conf1", + ] + names = [ + "FGSM", + "Rand-FGSM", + "Iter-FGSM", + "CW-L2 conf=0", + "CW-L2 conf=1", + "CW-Linf conf=0", + "CW-Linf conf=1", + ] + fig_names = [ + "fgsm", + "randfgsm", + "iterfgsm", + "cwl2_conf0", + "cwl2_conf1", + "cwlinf_conf0", + "cwlinf_conf1", + ] + legends = ["ResNet34 (white-box)", "ThinResNet34", "ResETDNN"] + fmt = ["b", "r", "g", "m", "c", "y"] for i in range(len(names)): - res_dirs2 = [res_dirs1[i], 'transfer.lresnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2/' + res_dirs1[i], - 'transfer.resetdnn_nl5ld512_e256_arcs30m0.3_do0.1_adam_lr0.05_b512_amp.v2/' + res_dirs1[i]] - res_dirs2 = [res_dirs0 + '/' + s for s in res_dirs2] - plot_figs1(res_dirs2, legends, names[i] + ' black-box', fig_names[i] + '_bbox', fmt=fmt) - plot_figs2(res_dirs2, legends, names[i] + ' black-box', fig_names[i] + '_bbox2', fmt=fmt) - - + res_dirs2 = [ + res_dirs1[i], + "transfer.lresnet34_zir_e256_arc0.3_do0_adam_lr0.05_b512.v2/" + + res_dirs1[i], + "transfer.resetdnn_nl5ld512_e256_arcs30m0.3_do0.1_adam_lr0.05_b512_amp.v2/" + + res_dirs1[i], + ] + res_dirs2 = [res_dirs0 + "/" + s for s in res_dirs2] + plot_figs1( + res_dirs2, legends, names[i] + " black-box", fig_names[i] + "_bbox", fmt=fmt + ) + plot_figs2( + res_dirs2, + legends, + names[i] + " black-box", + fig_names[i] + "_bbox2", + fmt=fmt, + ) diff --git a/egs/voxceleb/v1/local/make_trials_subset.py b/egs/voxceleb/v1/local/make_trials_subset.py index 585e28d7..da230842 100755 --- a/egs/voxceleb/v1/local/make_trials_subset.py +++ b/egs/voxceleb/v1/local/make_trials_subset.py @@ -4,52 +4,58 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import logging import numpy as np from hyperion.hyp_defs import float_cpu, config_logger from hyperion.utils import SparseTrialKey + def make_trials(in_key_file, out_key_file, ntar, nnon, seed): - + rng = np.random.RandomState(seed=seed) - logging.info('Load key: %s' % in_key_file) + logging.info("Load key: %s" % in_key_file) key = SparseTrialKey.load_txt(in_key_file) - + nz_idx = key.tar.nonzero() nnz = len(nz_idx[0]) p = rng.permutation(nnz)[ntar:] - nz_idx = (nz_idx[0][p], nz_idx[1][p]) + nz_idx = (nz_idx[0][p], nz_idx[1][p]) key.tar[nz_idx] = False - + nz_idx = key.non.nonzero() nnz = len(nz_idx[0]) p = rng.permutation(nnz)[nnon:] - nz_idx = (nz_idx[0][p], nz_idx[1][p]) - key.non[nz_idx] = False + nz_idx = (nz_idx[0][p], nz_idx[1][p]) + key.non[nz_idx] = False - logging.info('Saving key: %s' % out_key_file) + logging.info("Saving key: %s" % out_key_file) key.save_txt(out_key_file) if __name__ == "__main__": - parser = ArgumentParser( - description='Makes a subset of 
a trial key') - - parser.add_argument('--in-key-file', required=True) - parser.add_argument('--out-key-file', required=True) - parser.add_argument('--ntar', required=True, type=int) - parser.add_argument('--nnon', required=True, type=int) - parser.add_argument('--seed', default=112358, type=int) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = ArgumentParser(description="Makes a subset of a trial key") + + parser.add_argument("--in-key-file", required=True) + parser.add_argument("--out-key-file", required=True) + parser.add_argument("--ntar", required=True, type=int) + parser.add_argument("--nnon", required=True, type=int) + parser.add_argument("--seed", default=112358, type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + make_trials(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/local/score_dcf.py b/egs/voxceleb/v1/local/score_dcf.py index e1982fed..9858583d 100755 --- a/egs/voxceleb/v1/local/score_dcf.py +++ b/egs/voxceleb/v1/local/score_dcf.py @@ -19,16 +19,18 @@ def score_dcf(key_file, score_file, output_path): - logging.info('Load key: %s' % key_file) + logging.info("Load key: %s" % key_file) key = SparseTrialKey.load_txt(key_file) - logging.info('Load scores: %s' % score_file) + logging.info("Load scores: %s" % score_file) scr = SparseTrialScores.load_txt(score_file) - logging.info('separating tar/non') + logging.info("separating tar/non") tar, non = scr.get_tar_non(key) - logging.info('computing EER/DCF') - priors = np.array([0.001, 0.005, 0.01, 0.05 ]) - min_dcf, act_dcf, eer, _, min_pmiss, min_pfa, act_pmiss, act_pfa = fast_eval(tar, non, priors, return_probs=True) - + logging.info("computing EER/DCF") + priors = np.array([0.001, 0.005, 0.01, 0.05]) + min_dcf, act_dcf, eer, _, min_pmiss, min_pfa, act_pmiss, act_pfa = fast_eval( + tar, non, priors, return_probs=True + ) + output_dir = os.path.dirname(output_path) if not os.path.isdir(output_dir): os.makedirs(output_dir) @@ -36,42 +38,51 @@ def score_dcf(key_file, score_file, output_path): ntar = len(tar) nnon = len(non) - output_file = output_path + '_results' - with open(output_file, 'w') as f: - s = 'EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f} ntar: {9:d} nnon: {10:d}\n'.format( - eer * 100, min_dcf[3], act_dcf[3], - min_dcf[2], act_dcf[2], - min_dcf[1], act_dcf[1], - min_dcf[0], act_dcf[0], - ntar, nnon) + output_file = output_path + "_results" + with open(output_file, "w") as f: + s = "EER: {0:.2f} DCF5e-2: {1:.3f} / {2:.3f} DCF1e-2: {3:.3f} / {4:.3f} DCF5e-3: {5:.3f} / {6:.3f} DCF1e-3: {7:.3f} / {8:.3f} ntar: {9:d} nnon: {10:d}\n".format( + eer * 100, + min_dcf[3], + act_dcf[3], + min_dcf[2], + act_dcf[2], + min_dcf[1], + act_dcf[1], + min_dcf[0], + act_dcf[0], + ntar, + nnon, + ) f.write(s) logging.info(s) - s = 'min-pmiss={} min-pfa={} act-pmiss={} act-pfa={}'.format( - min_pmiss, min_pfa, act_pmiss, act_pfa) + s = "min-pmiss={} min-pfa={} act-pmiss={} act-pfa={}".format( + min_pmiss, min_pfa, act_pmiss, act_pfa + ) logging.info(s) - s = 'min-Nmiss={} min-Nfa={} act-Nmiss={} act-Nfa={}'.format( - min_pmiss * ntar, min_pfa * nnon, act_pmiss * ntar, act_pfa * nnon) + s = "min-Nmiss={} min-Nfa={} act-Nmiss={} act-Nfa={}".format( + min_pmiss * ntar, min_pfa * nnon, act_pmiss * ntar, act_pfa * 
nnon + ) logging.info(s) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Computes EER and DCF') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Computes EER and DCF", + ) + + parser.add_argument("--key-file", required=True) + parser.add_argument("--score-file", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - parser.add_argument('--key-file', required=True) - parser.add_argument('--score-file', required=True) - parser.add_argument('--output-path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - score_dcf(**vars(args)) - + score_dcf(**vars(args)) diff --git a/egs/voxceleb/v1/steps_be/eval-be-v1.py b/egs/voxceleb/v1/steps_be/eval-be-v1.py index 0cb7217f..c88b05fc 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-v1.py +++ b/egs/voxceleb/v1/steps_be/eval-be-v1.py @@ -8,7 +8,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -21,67 +26,84 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, - model_part_idx, num_model_parts, - seg_part_idx, num_seg_parts, - **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None - tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc, - model_part_idx, num_model_parts, seg_part_idx, num_seg_parts) + tdr = TDR( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + ) x_e, x_t, enroll, ndx = tdr.read() - logging.info('loading plda model: %s' % (model_file)) + logging.info("loading plda model: %s" % (model_file)) model = F.load_plda(plda_type, model_file) - + t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) if num_model_parts > 1 or num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, model_part_idx, seg_part_idx) - logging.info('saving scores to %s' % (score_file)) + score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask) s.save_txt(score_file) - if __name__ == "__main__": - parser=ArgumentParser( - description='Eval PLDA') + parser = ArgumentParser(description="Eval PLDA") - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**namespace_to_dict(args)) - - diff --git a/egs/voxceleb/v1/steps_be/eval-be-v2.py b/egs/voxceleb/v1/steps_be/eval-be-v2.py index 41f0cd1c..0438e373 100755 --- a/egs/voxceleb/v1/steps_be/eval-be-v2.py +++ b/egs/voxceleb/v1/steps_be/eval-be-v2.py @@ -6,7 +6,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -20,61 +25,78 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, score_file, - model_part_idx, num_model_parts, - seg_part_idx, num_seg_parts, **kwargs): - - logging.info('loading data') +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + score_file, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + **kwargs +): + + logging.info("loading data") if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None - tdr = TDR(iv_file, ndx_file, enroll_file, test_file, preproc, - model_part_idx, num_model_parts, seg_part_idx, num_seg_parts) + tdr = TDR( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc, + model_part_idx, + num_model_parts, + seg_part_idx, + num_seg_parts, + ) x_e, x_t, enroll, ndx = tdr.read() t1 = time.time() - logging.info('computing llr') + logging.info("computing llr") scores = cosine_scoring(x_e, x_t) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms.' 
- % (dt, dt/num_trials*1000)) + logging.info( + "scoring elapsed time: %.2f s. elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) if num_model_parts > 1 or num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, model_part_idx, seg_part_idx) - logging.info('saving scores to %s' % (score_file)) + score_file = "%s-%03d-%03d" % (score_file, model_part_idx, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) s = TrialScores(enroll, ndx.seg_set, scores, score_mask=ndx.trial_mask) s.save_txt(score_file) - if __name__ == "__main__": - parser=ArgumentParser( - description='Eval cosine-scoring') + parser = ArgumentParser(description="Eval cosine-scoring") - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) assert args.test_file is not None or args.ndx_file is not None eval_plda(**namespace_to_dict(args)) - - diff --git a/egs/voxceleb/v1/steps_be/eval-calibration-v1.py b/egs/voxceleb/v1/steps_be/eval-calibration-v1.py index 38250797..bf252f60 100755 --- a/egs/voxceleb/v1/steps_be/eval-calibration-v1.py +++ b/egs/voxceleb/v1/steps_be/eval-calibration-v1.py @@ -8,7 +8,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -24,43 +29,41 @@ def eval_calibration(in_score_file, ndx_file, model_file, out_score_file): - logging.info('load ndx: %s' % ndx_file) + logging.info("load ndx: %s" % ndx_file) try: ndx = TrialNdx.load_txt(ndx_file) except: ndx = TrialKey.load_txt(ndx_file) - - logging.info('load scores: %s' % in_score_file) + + logging.info("load scores: %s" % in_score_file) scr = TrialScores.load_txt(in_score_file) scr = scr.align_with_ndx(ndx) - logging.info('load model: %s' % model_file) + logging.info("load model: %s" % model_file) lr = LR.load(model_file) - logging.info('apply calibration') + logging.info("apply calibration") s_cal = lr.predict(scr.scores.ravel()) scr.scores = np.reshape(s_cal, scr.scores.shape) - logging.info('save scores: %s' % out_score_file) + logging.info("save scores: %s" % out_score_file) scr.save_txt(out_score_file) - - + + if __name__ == "__main__": - parser=ArgumentParser( - description='Evals linear calibration') + parser = ArgumentParser(description="Evals linear calibration") - 
parser.add_argument('--in-score-file', dest='in_score_file', required=True) - parser.add_argument('--out-score-file', dest='out_score_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--in-score-file", dest="in_score_file", required=True) + parser.add_argument("--out-score-file", dest="out_score_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_calibration(**namespace_to_dict(args)) - + eval_calibration(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/train-be-v2.py b/egs/voxceleb/v1/steps_be/train-be-v2.py index b5f60b11..1d72df93 100755 --- a/egs/voxceleb/v1/steps_be/train-be-v2.py +++ b/egs/voxceleb/v1/steps_be/train-be-v2.py @@ -6,7 +6,12 @@ import logging import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import numpy as np @@ -17,8 +22,8 @@ from numpy.linalg import matrix_rank -def train_be(iv_file, train_list, - output_path, **kwargs): + +def train_be(iv_file, train_list, output_path, **kwargs): # Read data vr_args = VR.filter_args(**kwargs) @@ -31,18 +36,18 @@ def train_be(iv_file, train_list, pca = None if rank < x.shape[1]: # do PCA if rank of x is smaller than its dimension - pca = PCA(pca_dim=rank, name='pca') + pca = PCA(pca_dim=rank, name="pca") pca.fit(x) x = pca.predict(x) - logging.info('PCA rank=%d' % (rank)) + logging.info("PCA rank=%d" % (rank)) # Train centering and whitening t1 = time.time() - cw = CentWhiten(name='cw') + cw = CentWhiten(name="cw") cw.fit(x) - logging.info('PCA-CW Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("PCA-CW Elapsed time: %.2f s." 
% (time.time() - t1)) + # Save models if pca is None: preproc = TransformList([cw]) @@ -52,27 +57,26 @@ def train_be(iv_file, train_list, if not os.path.exists(output_path): os.makedirs(ouput_path) - preproc.save(output_path + '/cw.h5') - - + preproc.save(output_path + "/cw.h5") + + if __name__ == "__main__": - parser=ArgumentParser( - description='Train Back-end') + parser = ArgumentParser(description="Train Back-end") + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - VR.add_argparse_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_be(**namespace_to_dict(args)) - + train_be(**namespace_to_dict(args)) diff --git a/egs/voxceleb/v1/steps_be/train-calibration-v1.py b/egs/voxceleb/v1/steps_be/train-calibration-v1.py index e527012b..7408fd1d 100755 --- a/egs/voxceleb/v1/steps_be/train-calibration-v1.py +++ b/egs/voxceleb/v1/steps_be/train-calibration-v1.py @@ -8,7 +8,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -23,62 +28,67 @@ def train_calibration(score_file, key_file, model_file, prior, lambda_reg, verbose): - logging.info('load key: %s' % key_file) + logging.info("load key: %s" % key_file) key = TrialKey.load_txt(key_file) - logging.info('load scores: %s' % score_file) + logging.info("load scores: %s" % score_file) scr = TrialScores.load_txt(score_file) tar, non = scr.get_tar_non(key) ntar = len(tar) nnon = len(non) min_dcf, p_miss, p_fa = compute_min_dcf(tar, non, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - logging.info('min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (min_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "min_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (min_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) - logging.info('train calibration') + logging.info("train calibration") x = np.concatenate((tar, non)) - y = np.concatenate((np.ones((ntar,), dtype='int32'), - np.zeros((nnon,), dtype='int32'))) - lr = LR(prior=prior, lambda_reg=lambda_reg, bias_scaling=1, solver='liblinear', verbose=verbose) + y = np.concatenate( + (np.ones((ntar,), dtype="int32"), np.zeros((nnon,), dtype="int32")) + ) + lr = LR( + prior=prior, + lambda_reg=lambda_reg, + bias_scaling=1, + solver="liblinear", + verbose=verbose, + ) lr.fit(x, y) print(lr.A) print(lr.b) - logging.info('save calibration at %s' % model_file) + logging.info("save calibration at %s" % model_file) lr.save(model_file) - logging.info('calibrate scores') + logging.info("calibrate scores") tar_cal = lr.predict(tar) non_cal = lr.predict(non) act_dcf, p_miss, p_fa = compute_act_dcf(tar_cal, non_cal, prior) - n_miss = p_miss*ntar - n_fa = p_fa*nnon - 
logging.info('act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f' % - (act_dcf, p_miss*100, p_fa*100, n_miss, n_fa)) - - - - + n_miss = p_miss * ntar + n_fa = p_fa * nnon + logging.info( + "act_dcf: %.3f p_miss: %.2f p_fa: %.2f n_miss: %.1f n_fa: %.1f" + % (act_dcf, p_miss * 100, p_fa * 100, n_miss, n_fa) + ) + + if __name__ == "__main__": - parser=ArgumentParser( - description='Trains llr calibration') + parser = ArgumentParser(description="Trains llr calibration") - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--prior', dest='prior', type=float, - default=0.01) - parser.add_argument('--lambda-reg', dest='lambda_reg', type=float, - default=1e-5) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--prior", dest="prior", type=float, default=0.01) + parser.add_argument("--lambda-reg", dest="lambda_reg", type=float, default=1e-5) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - train_calibration(**namespace_to_dict(args)) - + train_calibration(**namespace_to_dict(args)) diff --git a/hyperion/__init__.py b/hyperion/__init__.py index cdf812a4..54eafbce 100644 --- a/hyperion/__init__.py +++ b/hyperion/__init__.py @@ -4,7 +4,6 @@ """ - from . import utils from . import metrics from . import pdfs @@ -13,8 +12,10 @@ from . import feats from . import calibration from . import score_norm -#from . import keras + +# from . import keras from . import helpers -#from . import generators -__version__ = '0.1.0' +# from . import generators + +__version__ = "0.1.0" diff --git a/hyperion/augment/noise_augment.py b/hyperion/augment/noise_augment.py index d6054a79..3a500b7c 100644 --- a/hyperion/augment/noise_augment.py +++ b/hyperion/augment/noise_augment.py @@ -16,28 +16,27 @@ class SingleNoiseAugment(object): - """ Class to augment speech with additive noise of a single type, + """Class to augment speech with additive noise of a single type, e.g., music, babble, ... Attributes: noise_type: string label indicating the noise type. - noise_path: path to Kaldi style wav.scp file indicating the path + noise_path: path to Kaldi style wav.scp file indicating the path to the noise wav files. min_snr: mininimum SNR(dB) to sample from. max_snr: maximum SNR(dB) to sample from. 
- rng: Random number generator returned by + rng: Random number generator returned by np.random.RandomState (optional) """ - def __init__(self, - noise_type, - noise_path, - min_snr, - max_snr, - random_seed=112358, - rng=None): + + def __init__( + self, noise_type, noise_path, min_snr, max_snr, random_seed=112358, rng=None + ): logging.info( - 'init noise_augment with noise={} noise_path={} snr={}-{}'.format( - noise_type, noise_path, min_snr, max_snr)) + "init noise_augment with noise={} noise_path={} snr={}-{}".format( + noise_type, noise_path, min_snr, max_snr + ) + ) self.noise_type = noise_type self.r = AR(noise_path) @@ -51,12 +50,11 @@ def __init__(self, else: self.rng = deepcopy(rng) - logging.info( - 'init noise_augment with noise={} done'.format(noise_type)) + logging.info("init noise_augment with noise={} done".format(noise_type)) @staticmethod def _power(x): - return 10 * np.log10((x**2).sum()) + return 10 * np.log10((x ** 2).sum()) @staticmethod def snr(x, n): @@ -65,7 +63,7 @@ def snr(x, n): @staticmethod def _compute_noise_scale(x, n, target_snr): snr = SingleNoiseAugment.snr(x, n) - return 10**((snr - target_snr) / 20) + return 10 ** ((snr - target_snr) / 20) def forward(self, x): num_samples = x.shape[0] @@ -91,8 +89,7 @@ def forward(self, x): need_samples = min(x.shape[0], noise_k.shape[0]) noise = noise_k[:need_samples] else: - need_samples = min(x.shape[0] - noise.shape[0], - noise_k.shape[0]) + need_samples = min(x.shape[0] - noise.shape[0], noise_k.shape[0]) noise = np.concatenate((noise, noise_k[:need_samples])) if need_samples < noise_k.shape[0]: @@ -103,7 +100,7 @@ def forward(self, x): target_snr = self.rng.uniform(self.min_snr, self.max_snr) scale = self._compute_noise_scale(x, noise, target_snr) - info = {'noise_type': self.noise_type, 'snr': target_snr} + info = {"noise_type": self.noise_type, "snr": target_snr} return x + scale * noise, info def __call__(self, x): @@ -111,7 +108,7 @@ def __call__(self, x): class NoiseAugment(object): - """ Class to augment speech with additive noise from multiple types, + """Class to augment speech with additive noise from multiple types, e.g., music, babble, ... It will randomly choose which noise type to add. @@ -120,28 +117,31 @@ class NoiseAugment(object): noise_types: dictionary of options with one entry per noise-type, Each entry is also a dictiory with the following entries: weight, max_snr, min_snr, noise_path. The weight parameter - is proportional to how often we want to sample a given noise + is proportional to how often we want to sample a given noise type. 
- rng: Random number generator returned by + rng: Random number generator returned by np.random.RandomState (optional) """ + def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): - logging.info('init noise augment') + logging.info("init noise augment") self.noise_prob = noise_prob assert isinstance(noise_types, dict) # num_noise_types = len(noise_types) augmenters = [] - self.weights = np.zeros((len(noise_types), )) + self.weights = np.zeros((len(noise_types),)) count = 0 for key, opts in noise_types.items(): - self.weights[count] = opts['weight'] - aug = SingleNoiseAugment(key, - opts['noise_path'], - opts['min_snr'], - opts['max_snr'], - random_seed=random_seed, - rng=rng) + self.weights[count] = opts["weight"] + aug = SingleNoiseAugment( + key, + opts["noise_path"], + opts["min_snr"], + opts["max_snr"], + random_seed=random_seed, + rng=rng, + ) augmenters.append(aug) count += 1 @@ -156,27 +156,28 @@ def __init__(self, noise_prob, noise_types, random_seed=112358, rng=None): @classmethod def create(cls, cfg, random_seed=112358, rng=None): - """ Creates a NoiseAugment object from options dictionary or YAML file. + """Creates a NoiseAugment object from options dictionary or YAML file. Args: cfg: YAML file path or dictionary with noise options. - rng: Random number generator returned by + rng: Random number generator returned by np.random.RandomState (optional) - + Returns: NoiseAugment object """ if isinstance(cfg, str): - with open(cfg, 'r') as f: + with open(cfg, "r") as f: cfg = yaml.load(f, Loader=yaml.FullLoader) - assert isinstance(cfg, - dict), ('wrong object type for cfg={}'.format(cfg)) + assert isinstance(cfg, dict), "wrong object type for cfg={}".format(cfg) - return cls(noise_prob=cfg['noise_prob'], - noise_types=cfg['noise_types'], - random_seed=random_seed, - rng=rng) + return cls( + noise_prob=cfg["noise_prob"], + noise_types=cfg["noise_types"], + random_seed=random_seed, + rng=rng, + ) def forward(self, x): @@ -186,7 +187,7 @@ def forward(self, x): if p > self.noise_prob: # we don't add noise - info = {'noise_type': None, 'snr': 100} + info = {"noise_type": None, "snr": 100} return x, info # decide the noise type diff --git a/hyperion/augment/reverb_augment.py b/hyperion/augment/reverb_augment.py index ab73a196..58e4c0cb 100644 --- a/hyperion/augment/reverb_augment.py +++ b/hyperion/augment/reverb_augment.py @@ -17,16 +17,17 @@ from ..hyp_defs import float_cpu from ..io import RandomAccessDataReaderFactory as DRF + class RIRNormType(Enum): """normalization type to apply to RIR""" + NONE = 0 # none MAX = 1 # max ray normalized to 1 - ENERGY = 2 # energy of RIR normalized to 1 - - + ENERGY = 2 # energy of RIR normalized to 1 + class SingleReverbAugment(object): - """ Class to augment speech with reverberation using RIR from a + """Class to augment speech with reverberation using RIR from a single type, e.g., small room, medium room, large room Attributes: @@ -34,21 +35,32 @@ class SingleReverbAugment(object): rir_path: Kaldi style rspecifier to Ark or H5 file containing RIRs rir_norm: RIR normalization method between None, 'max' or 'energy' comp_delay: compensate the delay introduced by the RIR if any, - this delay will happen if the maximum of the RIR is not in + this delay will happen if the maximum of the RIR is not in its first sample. 
preload_rirs: if True all RIRS are loaded into RAM - rng: Random number generator returned by + rng: Random number generator returned by np.random.RandomState (optional) """ - def __init__(self, rir_type, rir_path, rir_norm=None, comp_delay=True, - preload_rirs=True, random_seed=112358, rng=None): + def __init__( + self, + rir_type, + rir_path, + rir_norm=None, + comp_delay=True, + preload_rirs=True, + random_seed=112358, + rng=None, + ): self.rir_type = rir_type - logging.info(('init reverb_augment with RIR={} rir_path={} ' - 'rir_norm={} comp_delay={}').format( - rir_type, rir_path, rir_norm, comp_delay)) + logging.info( + ( + "init reverb_augment with RIR={} rir_path={} " + "rir_norm={} comp_delay={}" + ).format(rir_type, rir_path, rir_norm, comp_delay) + ) self.r = DRF.create(rir_path) - #logging.info('init reverb_augment with RIR={} read RIR lengths'.format(rir_type)) + # logging.info('init reverb_augment with RIR={} read RIR lengths'.format(rir_type)) self.rir_keys = self.r.keys self.preload_rirs = preload_rirs if preload_rirs: @@ -59,11 +71,11 @@ def __init__(self, rir_type, rir_path, rir_norm=None, comp_delay=True, if rir_norm is None: self.rir_norm = RIRNormType.NONE - elif rir_norm == 'max': + elif rir_norm == "max": self.rir_norm = RIRNormType.MAX - elif rir_norm == 'energy': + elif rir_norm == "energy": self.rir_norm = RIRNormType.ENERGY - + self.comp_delay = comp_delay self.lock = multiprocessing.Lock() @@ -72,13 +84,11 @@ def __init__(self, rir_type, rir_path, rir_norm=None, comp_delay=True, else: self.rng = deepcopy(rng) - logging.info('init reverb_augment with RIR={} done'.format(rir_type)) - + logging.info("init reverb_augment with RIR={} done".format(rir_type)) @staticmethod def _power(x): - return 10 * np.log10((x**2).sum()+1e-5) - + return 10 * np.log10((x ** 2).sum() + 1e-5) @staticmethod def sdr(x, y, scale, delay): @@ -86,7 +96,6 @@ def sdr(x, y, scale, delay): n = y[delay:] - x return SingleReverbAugment._power(x) - SingleReverbAugment._power(n) - def _norm_rir(self, h): if self.rir_norm == RIRNormType.NONE: return h @@ -94,9 +103,8 @@ def _norm_rir(self, h): idx = np.argmax(np.abs(h)) return h / h[idx] - return h / np.sum(h**2) + return h / np.sum(h ** 2) - def forward(self, x): num_samples = x.shape[0] with self.lock: @@ -110,28 +118,30 @@ def forward(self, x): h = self._norm_rir(h) h_delay = np.argmax(np.abs(h)) - h_max = h[h_delay] + h_max = h[h_delay] y = signal.fftconvolve(x, h) if self.comp_delay: - y = y[h_delay:num_samples+h_delay] + y = y[h_delay : num_samples + h_delay] h_delay = 0 else: - y = y[:num_samples+h_delay] + y = y[: num_samples + h_delay] srr = self.sdr(x, y, h_max, h_delay) - #logging.info('rt={} {} {} {} {}'.format(t2-t1, t3-t2, t4-t3, t5-t4, t6-t5)) - info = {'rir_type': self.rir_type, 'srr': srr, - 'h_max': h_max, 'h_delay': h_delay} + # logging.info('rt={} {} {} {} {}'.format(t2-t1, t3-t2, t4-t3, t5-t4, t6-t5)) + info = { + "rir_type": self.rir_type, + "srr": srr, + "h_max": h_max, + "h_delay": h_delay, + } return y, info - def __call__(self, x): return self.forward(x) - class ReverbAugment(object): - """ Class to augment speech with reverberation with RIRS from multiple types, + """Class to augment speech with reverberation with RIRS from multiple types, e.g., small room, medium room, large room. It will randomly choose which RIR type to add. 
@@ -140,34 +150,36 @@ class ReverbAugment(object): rir_types: dictionary of options with one entry per RIR-type, Each entry is also a dictiory with the following entries: weight, rir_norm, comp_delay, rir_path. The weight parameter - is proportional to how often we want to sample a given RIR + is proportional to how often we want to sample a given RIR type. - max_reverb_context: number of samples required as left context + max_reverb_context: number of samples required as left context for the convolution operation. - rng: Random number generator returned by + rng: Random number generator returned by np.random.RandomState (optional) """ - def __init__(self, reverb_prob, rir_types, max_reverb_context=0, random_seed=112358, rng=None): - logging.info('init reverb_augment') + def __init__( + self, reverb_prob, rir_types, max_reverb_context=0, random_seed=112358, rng=None + ): + + logging.info("init reverb_augment") self.reverb_prob = reverb_prob assert isinstance(rir_types, dict) num_rir_types = len(rir_types) - + augmenters = [] self.weights = np.zeros((len(rir_types),)) count = 0 - val_opts = ('rir_path', 'rir_norm', 'comp_delay', 'preload_rirs') + val_opts = ("rir_path", "rir_norm", "comp_delay", "preload_rirs") for key, opts in rir_types.items(): - self.weights[count] = opts['weight'] - + self.weights[count] = opts["weight"] + opts_i = {} for opt_key in val_opts: if opt_key in opts: opts_i[opt_key] = opts[opt_key] - aug = SingleReverbAugment( - key, **opts_i, random_seed=random_seed, rng=rng) + aug = SingleReverbAugment(key, **opts_i, random_seed=random_seed, rng=rng) augmenters.append(aug) count += 1 @@ -184,37 +196,37 @@ def __init__(self, reverb_prob, rir_types, max_reverb_context=0, random_seed=112 else: self.rng = deepcopy(rng) - @classmethod - def create(cls, cfg, random_seed=112358, rng=None): - """ Creates a ReverbAugment object from options dictionary or YAML file. + def create(cls, cfg, random_seed=112358, rng=None): + """Creates a ReverbAugment object from options dictionary or YAML file. Args: cfg: YAML file path or dictionary with reverb options. 
- rng: Random number generator returned by + rng: Random number generator returned by np.random.RandomState (optional) - + Returns: ReverbAugment object """ if isinstance(cfg, str): - with open(cfg, 'r') as f: + with open(cfg, "r") as f: cfg = yaml.load(f, Loader=yaml.FullLoader) - assert isinstance(cfg, dict), ( - 'wrong object type for cfg={}'.format(cfg)) - - return cls(reverb_prob=cfg['reverb_prob'], rir_types=cfg['rir_types'], - max_reverb_context=cfg['max_reverb_context'], - random_seed=random_seed, rng=rng) + assert isinstance(cfg, dict), "wrong object type for cfg={}".format(cfg) + return cls( + reverb_prob=cfg["reverb_prob"], + rir_types=cfg["rir_types"], + max_reverb_context=cfg["max_reverb_context"], + random_seed=random_seed, + rng=rng, + ) @staticmethod def sdr(x, y, scale, delay): return SingleReverbAugment.sdr(x, y, scale, delay) - def forward(self, x): # decide whether to add reverb or not @@ -223,7 +235,7 @@ def forward(self, x): if p > self.reverb_prob: # we don't add reverb - info = {'rir_type': None, 'srr': 100, 'h_max': 1, 'h_delay': 0} + info = {"rir_type": None, "srr": 100, "h_max": 1, "h_delay": 0} return x, info # decide the RIR type @@ -234,8 +246,5 @@ def forward(self, x): x, info = self.augmenters[rir_idx](x) return x, info - def __call__(self, x): return self.forward(x) - - diff --git a/hyperion/bin/apply-mvn-select-frames.py b/hyperion/bin/apply-mvn-select-frames.py index 444bb807..71c52cda 100755 --- a/hyperion/bin/apply-mvn-select-frames.py +++ b/hyperion/bin/apply-mvn-select-frames.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -22,11 +27,22 @@ from hyperion.feats import FrameSelector as FSel -def process_feats(input_spec, output_spec, vad_spec, write_num_frames_spec, - scp_sep, path_prefix, vad_path_prefix, part_idx, num_parts, - compress, compression_method, **kwargs): +def process_feats( + input_spec, + output_spec, + vad_spec, + write_num_frames_spec, + scp_sep, + path_prefix, + vad_path_prefix, + part_idx, + num_parts, + compress, + compression_method, + **kwargs +): - logging.info('initializing') + logging.info("initializing") mvn_args = MVN.filter_args(**kwargs) mvn = MVN(**mvn_args) if vad_spec is not None: @@ -36,79 +52,126 @@ def process_feats(input_spec, output_spec, vad_spec, write_num_frames_spec, if write_num_frames_spec is not None: keys = [] info = [] - - logging.info('opening output stream: %s' % (output_spec)) - with DWF.create(output_spec, - compress=compress, compression_method=compression_method, - scp_sep=scp_sep) as writer: - - logging.info('opening input stream: %s' % (output_spec)) - with DRF.create(input_spec, path_prefix=path_prefix, scp_sep=scp_sep, - part_idx=part_idx, num_parts=num_parts) as reader: + + logging.info("opening output stream: %s" % (output_spec)) + with DWF.create( + output_spec, + compress=compress, + compression_method=compression_method, + scp_sep=scp_sep, + ) as writer: + + logging.info("opening input stream: %s" % (output_spec)) + with DRF.create( + input_spec, + path_prefix=path_prefix, + scp_sep=scp_sep, + part_idx=part_idx, + num_parts=num_parts, + ) as reader: if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) - v_reader = RDRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep) - + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = 
RDRF.create( + vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep + ) + while not reader.eof(): key, data = reader.read(1) if len(key) == 0: break - logging.info('processing feats at %s' % (key[0])) + logging.info("processing feats at %s" % (key[0])) x = mvn.normalize(data[0]) if vad_spec is not None: - vad = v_reader.read(key)[0].astype('bool') + vad = v_reader.read(key)[0].astype("bool") tot_frames = x.shape[0] x = fs.select(x, vad) - logging.info('for %s detected %d/%d (%.2f %%) speech frames' - % (key[0], x.shape[0], tot_frames, x.shape[0]/tot_frames*100)) - if x.shape[0]>0: + logging.info( + "for %s detected %d/%d (%.2f %%) speech frames" + % ( + key[0], + x.shape[0], + tot_frames, + x.shape[0] / tot_frames * 100, + ) + ) + if x.shape[0] > 0: writer.write(key, [x]) if write_num_frames_spec is not None: keys += key info.append(x.shape[0]) if write_num_frames_spec is not None: - logging.info('writing num-frames to %s' % (write_num_frames_spec)) + logging.info("writing num-frames to %s" % (write_num_frames_spec)) u2nf = Utt2Info.create(keys, info) u2nf.save(write_num_frames_spec) - - + + if __name__ == "__main__": - - parser=ArgumentParser( - description='Apply CMVN and remove silence') - - parser.add_argument('--input', dest='input_spec', required=True) - parser.add_argument('--output', dest='output_spec', required=True) - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--write-num-frames', dest='write_num_frames_spec', default=None) - parser.add_argument('--scp-sep', dest='scp_sep', default=' ', - help=('scp file field separator')) - parser.add_argument('--path-prefix', dest='path_prefix', default=None, - help=('scp file_path prefix')) - parser.add_argument('--vad-path-prefix', dest='vad_path_prefix', default=None, - help=('scp file_path prefix for vad')) - parser.add_argument('--part-idx', dest='part_idx', type=int, default=1, - help=('splits the list of files in num-parts and process part_idx')) - parser.add_argument('--num-parts', dest='num_parts', type=int, default=1, - help=('splits the list of files in num-parts and process part_idx')) - - parser.add_argument('--compress', dest='compress', default=False, action='store_true', help='Lossy compress the features') - parser.add_argument('--compression-method', dest='compression_method', default='auto', - choices=compression_methods, - help=('Kaldi compression method: ' - '{auto (default), speech_feat, ' - '2byte-auto, 2byte-signed-integer, ' - '1byte-auto, 1byte-unsigned-integer, 1byte-0-1}.')) + + parser = ArgumentParser(description="Apply CMVN and remove silence") + + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) + parser.add_argument( + "--scp-sep", dest="scp_sep", default=" ", help=("scp file field separator") + ) + parser.add_argument( + "--path-prefix", dest="path_prefix", default=None, help=("scp file_path prefix") + ) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + parser.add_argument( + "--part-idx", + dest="part_idx", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + parser.add_argument( + "--num-parts", + dest="num_parts", + type=int, + default=1, + help=("splits the list of files in num-parts and process 
part_idx"), + ) + + parser.add_argument( + "--compress", + dest="compress", + default=False, + action="store_true", + help="Lossy compress the features", + ) + parser.add_argument( + "--compression-method", + dest="compression_method", + default="auto", + choices=compression_methods, + help=( + "Kaldi compression method: " + "{auto (default), speech_feat, " + "2byte-auto, 2byte-signed-integer, " + "1byte-auto, 1byte-unsigned-integer, 1byte-0-1}." + ), + ) MVN.add_argparse_args(parser) FSel.add_argparse_args(parser) - - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - args=parser.parse_args() + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + process_feats(**namespace_to_dict(args)) - diff --git a/hyperion/bin/compute-energy-vad.py b/hyperion/bin/compute-energy-vad.py index c0c20931..397aea80 100755 --- a/hyperion/bin/compute-energy-vad.py +++ b/hyperion/bin/compute-energy-vad.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -16,58 +21,66 @@ from hyperion.io import DataWriterFactory as DWF from hyperion.feats import EnergyVAD + def compute_vad(input_path, output_path, write_num_frames, **kwargs): vad_args = EnergyVAD.filter_args(**kwargs) vad = EnergyVAD(**vad_args) - + input_args = AR.filter_args(**kwargs) reader = AR(input_path, **input_args) - writer = DWF.create(output_path, scp_sep=' ') + writer = DWF.create(output_path, scp_sep=" ") if write_num_frames is not None: - f_num_frames = open(write_num_frames, 'w') + f_num_frames = open(write_num_frames, "w") for data in reader: key, x, fs = data - logging.info('Extracting VAD for %s' % (key)) + logging.info("Extracting VAD for %s" % (key)) t1 = time.time() y = vad.compute(x) - dt = (time.time() - t1)*1000 - rtf = vad.frame_shift*y.shape[0]/dt + dt = (time.time() - t1) * 1000 + rtf = vad.frame_shift * y.shape[0] / dt num_speech_frames = np.sum(y) prob_speech = num_speech_frames / y.shape[0] * 100 - logging.info('Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. real-time-factor=%.2f' % - (key, num_speech_frames, y.shape[0], prob_speech, dt, rtf)) + logging.info( + "Extracted VAD for %s detected %d/%d (%f %%) speech frames, elapsed-time=%.2f ms. 
real-time-factor=%.2f" + % (key, num_speech_frames, y.shape[0], prob_speech, dt, rtf) + ) writer.write([key], [y]) if write_num_frames is not None: - f_num_frames.write('%s %d\n' % (key, y.shape[0])) + f_num_frames.write("%s %d\n" % (key, y.shape[0])) vad.reset() - + if write_num_frames is not None: f_num_frames.close() if __name__ == "__main__": - - parser=ArgumentParser( - description='Compute Kaldi Energy VAD') - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_path', required=True) - parser.add_argument('--output', dest='output_path', required=True) - parser.add_argument('--write-num-frames', default=None) + parser = ArgumentParser(description="Compute Kaldi Energy VAD") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output", dest="output_path", required=True) + parser.add_argument("--write-num-frames", default=None) AR.add_class_args(parser) EnergyVAD.add_class_args(parser) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int, - help='Verbose level') - args=parser.parse_args() + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + compute_vad(**namespace_to_dict(args)) - diff --git a/hyperion/bin/compute-mfcc-feats.py b/hyperion/bin/compute-mfcc-feats.py index 4aab8c0b..589d3188 100755 --- a/hyperion/bin/compute-mfcc-feats.py +++ b/hyperion/bin/compute-mfcc-feats.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -18,72 +23,94 @@ from hyperion.io import compression_methods from hyperion.feats import MFCC -def compute_mfcc_feats(input_path, output_path, - compress, compression_method, - write_num_frames, **kwargs): + +def compute_mfcc_feats( + input_path, output_path, compress, compression_method, write_num_frames, **kwargs +): mfcc_args = MFCC.filter_args(**kwargs) mfcc = MFCC(**mfcc_args) - if mfcc.input_step == 'wave': + if mfcc.input_step == "wave": input_args = AR.filter_args(**kwargs) reader = AR(input_path, **input_args) else: input_args = DRF.filter_args(**kwargs) reader = DRF.create(input_path, **input_args) - writer = DWF.create(output_path, scp_sep=' ', - compress=compress, - compression_method=compression_method) + writer = DWF.create( + output_path, + scp_sep=" ", + compress=compress, + compression_method=compression_method, + ) if write_num_frames is not None: - f_num_frames = open(write_num_frames, 'w') - + f_num_frames = open(write_num_frames, "w") + for data in reader: - if mfcc.input_step == 'wave': + if mfcc.input_step == "wave": key, x, fs = data else: key, x = data - logging.info('Extracting MFCC for %s num_samples=%d' % (key, len(x))) + logging.info("Extracting MFCC for %s num_samples=%d" % (key, len(x))) t1 = time.time() y = mfcc.compute(x) - dt = (time.time() - t1)*1000 - rtf = dt/(mfcc.frame_shift*y.shape[0]) - logging.info('Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. 
real-time-factor=%.2f' % - (key, y.shape[0], dt, rtf)) + dt = (time.time() - t1) * 1000 + rtf = dt / (mfcc.frame_shift * y.shape[0]) + logging.info( + "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f" + % (key, y.shape[0], dt, rtf) + ) writer.write([key], [y]) - + if write_num_frames is not None: - f_num_frames.write('%s %d\n' % (key, y.shape[0])) + f_num_frames.write("%s %d\n" % (key, y.shape[0])) mfcc.reset() - + if write_num_frames is not None: f_num_frames.close() - + if __name__ == "__main__": - - parser=ArgumentParser( - description='Compute MFCC features') - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_path', required=True) - parser.add_argument('--output', dest='output_path', required=True) - parser.add_argument('--write-num-frames', default=None) + parser = ArgumentParser(description="Compute MFCC features") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output", dest="output_path", required=True) + parser.add_argument("--write-num-frames", default=None) AR.add_class_args(parser) DRF.add_class_args(parser) MFCC.add_class_args(parser) - parser.add_argument('--compress', dest='compress', default=False, action='store_true', help='Compress the features') - parser.add_argument('--compression-method', dest='compression_method', default='auto', - choices=compression_methods, help='Compression method') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int, - help='Verbose level') - args=parser.parse_args() + parser.add_argument( + "--compress", + dest="compress", + default=False, + action="store_true", + help="Compress the features", + ) + parser.add_argument( + "--compression-method", + dest="compression_method", + default="auto", + choices=compression_methods, + help="Compression method", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + compute_mfcc_feats(**namespace_to_dict(args)) - diff --git a/hyperion/bin/copy-feats.py b/hyperion/bin/copy-feats.py index 2b6f2fa9..1ef044f5 100755 --- a/hyperion/bin/copy-feats.py +++ b/hyperion/bin/copy-feats.py @@ -18,22 +18,24 @@ if __name__ == "__main__": - - parser=argparse.ArgumentParser( + + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Copy features and change format') + fromfile_prefix_chars="@", + description="Copy features and change format", + ) - parser.add_argument('--input', dest='input_spec', nargs='+', required=True) - parser.add_argument('--output', dest='output_spec', required=True) - parser.add_argument('--write-num-frames', dest='write_num_frames', default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument("--input", dest="input_spec", nargs="+", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--write-num-frames", dest="write_num_frames", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) CF.add_argparse_args(parser) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose 
logging.debug(args) - + CF(**vars(args)) - diff --git a/hyperion/bin/eval-cos-1vs1.py b/hyperion/bin/eval-cos-1vs1.py index 8da41b02..123221f2 100755 --- a/hyperion/bin/eval-cos-1vs1.py +++ b/hyperion/bin/eval-cos-1vs1.py @@ -22,9 +22,10 @@ from hyperion.transforms import TransformList, LNorm -def eval_cos(iv_file, ndx_file, enroll_file, test_file, - preproc_file, score_file, **kwargs): - +def eval_cos( + iv_file, ndx_file, enroll_file, test_file, preproc_file, score_file, **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -37,41 +38,44 @@ def eval_cos(iv_file, ndx_file, enroll_file, test_file, lnorm = LNorm() x_e = lnorm.predict(x_e) x_t = lnorm.predict(x_t) - + t1 = time.time() scores = np.dot(x_e, x_t.T) - + dt = time.time() - t1 num_trials = x_e.shape[0] * x_t.shape[0] - logging.info('Elapsed time: %.2f s. Elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "Elapsed time: %.2f s. Elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) s = TrialScores(enroll, ndx.seg_set, scores) s.save(score_file) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval Trials by cosine scoring') + fromfile_prefix_chars="@", + description="Eval Trials by cosine scoring", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose assert args.test_file is not None or args.ndx_file is not None eval_cos(**vars(args)) - - diff --git a/hyperion/bin/eval-linear-gbe-up.py b/hyperion/bin/eval-linear-gbe-up.py index 67e0bd53..287117fd 100755 --- a/hyperion/bin/eval-linear-gbe-up.py +++ b/hyperion/bin/eval-linear-gbe-up.py @@ -24,11 +24,19 @@ from hyperion.classifiers import LinearGBEUP as GBE -def eval_linear_gbe(iv_file, class2int_file, test_file, - preproc_file, - model_file, score_file, vector_score_file, - normalize, eval_method, **kwargs): - +def eval_linear_gbe( + iv_file, + class2int_file, + test_file, + preproc_file, + model_file, + score_file, + vector_score_file, + normalize, + eval_method, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -39,45 +47,48 @@ def eval_linear_gbe(iv_file, class2int_file, test_file, x, ndx = tdr.read() model = GBE.load(model_file) - + t1 = time.time() scores = model.predict(x, 
eval_method, normalize) - + dt = time.time() - t1 - num_trials = scores.shape[0]*scores.shape[1] - logging.info('Elapsed time: %.2f s. Elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "Elapsed time: %.2f s. Elapsed time per trial: %.2f ms." + % (dt, dt / num_trials * 1000) + ) s = TrialScores(ndx.model_set, ndx.seg_set, scores.T) s.save(score_file) if vector_score_file is not None: h5 = HDW(vector_score_file) - h5.write(ndx.seg_set, '', scores) + h5.write(ndx.seg_set, "", scores) + - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval linear Gaussian back-end') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval linear Gaussian back-end", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--class2int-file", dest="class2int_file", required=True) + parser.add_argument("--test-file", dest="test_file", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--class2int-file', dest='class2int_file', required=True) - parser.add_argument('--test-file', dest='test_file', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - TDR.add_argparse_args(parser) GBE.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--vector-score-file', dest='vector_score_file', default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--vector-score-file", dest="vector_score_file", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose eval_linear_gbe(**vars(args)) - - diff --git a/hyperion/bin/eval-linear-gbe.py b/hyperion/bin/eval-linear-gbe.py index ba7c2154..a93b6c39 100755 --- a/hyperion/bin/eval-linear-gbe.py +++ b/hyperion/bin/eval-linear-gbe.py @@ -24,11 +24,19 @@ from hyperion.classifiers import LinearGBE as GBE -def eval_linear_gbe(iv_file, class2int_file, test_file, - preproc_file, - model_file, score_file, vector_score_file, - normalize, eval_method, **kwargs): - +def eval_linear_gbe( + iv_file, + class2int_file, + test_file, + preproc_file, + model_file, + score_file, + vector_score_file, + normalize, + eval_method, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -39,46 +47,49 @@ def eval_linear_gbe(iv_file, class2int_file, test_file, x, ndx = tdr.read() model = GBE.load(model_file) - + t1 = time.time() scores = model.predict(x, eval_method, normalize) - + dt = time.time() - t1 - num_trials = scores.shape[0]*scores.shape[1] - logging.info('Elapsed time: %.2f s. Elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "Elapsed time: %.2f s. Elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) s = TrialScores(ndx.model_set, ndx.seg_set, scores.T) s.save(score_file) if vector_score_file is not None: h5 = HDW(vector_score_file) - h5.write(ndx.seg_set, '', scores) + h5.write(ndx.seg_set, "", scores) + - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval linear Gaussian back-end') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval linear Gaussian back-end", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--class2int-file", dest="class2int_file", required=True) + parser.add_argument("--test-file", dest="test_file", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--class2int-file', dest='class2int_file', required=True) - parser.add_argument('--test-file', dest='test_file', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - TDR.add_argparse_args(parser) GBE.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--vector-score-file', dest='vector_score_file', default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--vector-score-file", dest="vector_score_file", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_linear_gbe(**vars(args)) - + eval_linear_gbe(**vars(args)) diff --git a/hyperion/bin/eval-linear-svmc.py b/hyperion/bin/eval-linear-svmc.py index a58fb77a..ff7b1faa 100755 --- a/hyperion/bin/eval-linear-svmc.py +++ b/hyperion/bin/eval-linear-svmc.py @@ -24,11 +24,18 @@ from hyperion.classifiers import LinearSVMC as SVM -def eval_svm(iv_file, class2int_file, test_file, - preproc_file, - model_file, score_file, vector_score_file, - eval_type, **kwargs): - +def eval_svm( + iv_file, + class2int_file, + test_file, + preproc_file, + model_file, + score_file, + vector_score_file, + eval_type, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -39,46 +46,49 @@ def eval_svm(iv_file, class2int_file, test_file, x, ndx = tdr.read() model = SVM.load(model_file) - + t1 = time.time() scores = model.predict(x, eval_type) - + dt = time.time() - t1 - num_trials = scores.shape[0]*scores.shape[1] - logging.info('Elapsed time: %.2f s. Elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "Elapsed time: %.2f s. Elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) s = TrialScores(ndx.model_set, ndx.seg_set, scores.T) s.save(score_file) if vector_score_file is not None: h5 = HDW(vector_score_file) - h5.write(ndx.seg_set, '', scores) + h5.write(ndx.seg_set, "", scores) + - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval SVM classifier') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval SVM classifier", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--class2int-file", dest="class2int_file", required=True) + parser.add_argument("--test-file", dest="test_file", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--class2int-file', dest='class2int_file', required=True) - parser.add_argument('--test-file', dest='test_file', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - TDR.add_argparse_args(parser) SVM.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--vector-score-file', dest='vector_score_file', default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--vector-score-file", dest="vector_score_file", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_svm(**vars(args)) - - diff --git a/hyperion/bin/eval-logistic-regression.py b/hyperion/bin/eval-logistic-regression.py index 825a37c6..d96e2473 100755 --- a/hyperion/bin/eval-logistic-regression.py +++ b/hyperion/bin/eval-logistic-regression.py @@ -24,11 +24,18 @@ from hyperion.classifiers import LogisticRegression as LR -def eval_lr(iv_file, class2int_file, test_file, - preproc_file, - model_file, score_file, vector_score_file, - eval_type, **kwargs): - +def eval_lr( + iv_file, + class2int_file, + test_file, + preproc_file, + model_file, + score_file, + vector_score_file, + eval_type, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -39,45 +46,49 @@ def eval_lr(iv_file, class2int_file, test_file, x, ndx = tdr.read() model = LR.load(model_file) - + t1 = time.time() scores = model.predict(x, eval_type) - + dt = time.time() - t1 - num_trials = scores.shape[0]*scores.shape[1] - logging.info('Elapsed time: %.2f s. Elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "Elapsed time: %.2f s. Elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) s = TrialScores(ndx.model_set, ndx.seg_set, scores.T) s.save(score_file) if vector_score_file is not None: h5 = HDW(vector_score_file) - h5.write(ndx.seg_set, '', scores) + h5.write(ndx.seg_set, "", scores) + - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval linear logistic regression classifier') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval linear logistic regression classifier", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--class2int-file", dest="class2int_file", required=True) + parser.add_argument("--test-file", dest="test_file", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--class2int-file', dest='class2int_file', required=True) - parser.add_argument('--test-file', dest='test_file', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - TDR.add_argparse_args(parser) LR.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--vector-score-file', dest='vector_score_file', default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--vector-score-file", dest="vector_score_file", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_lr(**vars(args)) + eval_lr(**vars(args)) diff --git a/hyperion/bin/eval-plda-1vs1.py b/hyperion/bin/eval-plda-1vs1.py index ee753a85..715d043a 100755 --- a/hyperion/bin/eval-plda-1vs1.py +++ b/hyperion/bin/eval-plda-1vs1.py @@ -23,11 +23,18 @@ from hyperion.transforms import TransformList +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + plda_type, + **kwargs +): -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, plda_type, **kwargs): - if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -38,44 +45,46 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, x_e, x_t, enroll, ndx = tdr.read() model = F.load_plda(plda_type, model_file) - + t1 = time.time() scores = model.llr_1vs1(x_e, x_t) - + dt = time.time() - t1 num_trials = x_e.shape[0] * x_t.shape[0] - logging.info('Elapsed time: %.2f s. Elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "Elapsed time: %.2f s. Elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) s = TrialScores(enroll, ndx.seg_set, scores) s.save(score_file) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/hyperion/bin/eval-plda-nvs1.py b/hyperion/bin/eval-plda-nvs1.py index 9a6a0536..30ea2606 100755 --- a/hyperion/bin/eval-plda-nvs1.py +++ b/hyperion/bin/eval-plda-nvs1.py @@ -23,10 +23,18 @@ from hyperion.transforms import TransformList -def eval_plda(iv_file, ndx_file, enroll_file, test_file, - preproc_file, - model_file, score_file, pool_method, **kwargs): - +def eval_plda( + iv_file, + ndx_file, + enroll_file, + test_file, + preproc_file, + model_file, + score_file, + pool_method, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -38,49 +46,55 @@ def eval_plda(iv_file, ndx_file, enroll_file, test_file, enroll, ids_e = np.unique(enroll, return_inverse=True) model = F.load_plda(plda_type, model_file) - + t1 = time.time() scores = model.llr_Nvs1(x_e, x_t, method=pool_method, ids1=ids_e) - + dt = time.time() - t1 num_trials = len(enroll) * x_t.shape[0] - logging.info('Elapsed time: %.2f s. Elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + logging.info( + "Elapsed time: %.2f s. Elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) s = TrialScores(enroll, ndx.seg_set, scores) s.save(score_file) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval PLDA N vs 1') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval PLDA N vs 1", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-file', dest='test_file', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-file", dest="test_file", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) TDR.add_argparse_args(parser) F.add_argparse_eval_args(parser) - parser.add_argument('--pool-method', dest='pool_method', type=str.lower, - default='vavg-lnorm', - choices=['book','vavg','vavg-lnorm','savg'], - help=('(default: %(default)s)')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--pool-method", + dest="pool_method", + type=str.lower, + default="vavg-lnorm", + choices=["book", "vavg", "vavg-lnorm", "savg"], + help=("(default: %(default)s)"), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + assert args.test_file is not None or args.ndx_file is not None eval_plda(**vars(args)) - - diff --git a/hyperion/bin/make-babble-noise-audio-files.py b/hyperion/bin/make-babble-noise-audio-files.py index d804244f..460f4044 100755 --- a/hyperion/bin/make-babble-noise-audio-files.py +++ b/hyperion/bin/make-babble-noise-audio-files.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -21,30 +26,35 @@ def make_noise(xs): - + lens = np.array([x.shape[0] for x in xs]) max_len = np.max(lens) - num_tiles = np.ceil(max_len/lens) + num_tiles = np.ceil(max_len / lens) for i in range(len(xs)): xs[i] = np.tile(xs[i], int(num_tiles[i]))[:max_len] for i in range(1, len(xs)): - xs[0] += (xs[i] - xs[i].mean()) + xs[0] += xs[i] - xs[i].mean() return xs[0] - + def make_babble_noise_audio_files( - input_path, output_path, output_script, - write_time_durs_spec, - min_spks=3, max_spks=7, num_reuses=5, - random_seed=112358, - **kwargs): + input_path, + output_path, + output_script, + write_time_durs_spec, + min_spks=3, + max_spks=7, + num_reuses=5, + random_seed=112358, + **kwargs +): input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info('input_args={}'.format(input_args)) - 
logging.info('output_args={}'.format(output_args)) + logging.info("input_args={}".format(input_args)) + logging.info("output_args={}".format(output_args)) rng = np.random.RandomState(seed=random_seed) @@ -56,12 +66,11 @@ def make_babble_noise_audio_files( t1 = time.time() with AR(input_path, **input_args) as reader: keys = reader.keys - with Writer(output_path, output_script, - **output_args) as writer: + with Writer(output_path, output_script, **output_args) as writer: for iters in range(num_reuses): keys = rng.permutation(keys) - + cur_spks = min_spks utt_list = [] for utt_idx in range(len(keys)): @@ -72,51 +81,56 @@ def make_babble_noise_audio_files( x, fs = reader.read(utt_list) fs = fs[0] y = make_noise(x) - babble_id = 'babble-%05d' % (count) - logging.info('writing file % s' % (babble_id)) + babble_id = "babble-%05d" % (count) + logging.info("writing file % s" % (babble_id)) writer.write([babble_id], [y], [fs]) if write_time_durs_spec is not None: okeys.append(babble_id) - info.append(y.shape[0]/fs) + info.append(y.shape[0] / fs) count += 1 utt_list = [] cur_spks += 1 if cur_spks > max_spks: cur_spks = min_spks - + if write_time_durs_spec is not None: - logging.info('writing time durations to %s' % (write_time_durs_spec)) + logging.info("writing time durations to %s" % (write_time_durs_spec)) u2td = Utt2Info.create(okeys, info) u2td.save(write_time_durs_spec) - - logging.info('finished making babble files, elapsed-time=%f' % (time.time() - t1)) + + logging.info("finished making babble files, elapsed-time=%f" % (time.time() - t1)) if __name__ == "__main__": - - parser=ArgumentParser( - description='Creates babble noise by adding speech files') - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_path', required=True) - parser.add_argument('--output-path', required=True) - parser.add_argument('--output-script', required=True) - parser.add_argument('--write-time-durs', dest='write_time_durs_spec', default=None) + parser = ArgumentParser(description="Creates babble noise by adding speech files") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument("--output-script", required=True) + parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) AR.add_class_args(parser) Writer.add_class_args(parser) - parser.add_argument('--min-spks', default=3, type=int) - parser.add_argument('--max-spks', default=10, type=int) - parser.add_argument('--num-reuses', default=5, type=int) - parser.add_argument('--random-seed', default=112358, type=int) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int, - help='Verbose level') - args=parser.parse_args() + parser.add_argument("--min-spks", default=3, type=int) + parser.add_argument("--max-spks", default=10, type=int) + parser.add_argument("--num-reuses", default=5, type=int) + parser.add_argument("--random-seed", default=112358, type=int) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + make_babble_noise_audio_files(**namespace_to_dict(args)) - diff --git a/hyperion/bin/merge-h5-files.py b/hyperion/bin/merge-h5-files.py index 06ff1d3b..a807c69c 100755 --- a/hyperion/bin/merge-h5-files.py +++ 
b/hyperion/bin/merge-h5-files.py @@ -14,26 +14,27 @@ from hyperion.io import H5Merger + def merge(input_files, output_path, chunk_size): m = H5Merger(input_files, output_path, chunk_size) m.merge() - + if __name__ == "__main__": - - parser=argparse.ArgumentParser( + + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Merges multiple hdf5 files into one file') + fromfile_prefix_chars="@", + description="Merges multiple hdf5 files into one file", + ) - parser.add_argument('--input-files', dest='input_files', nargs='+', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - #parser.add_argument('--field', dest='field', default='') - parser.add_argument('--chunk-size', dest='chunk_size', type=int, default=None) - #parser.add_argument('--squeeze', dest='squeeze', default=False, action='store_true') + parser.add_argument("--input-files", dest="input_files", nargs="+", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + # parser.add_argument('--field', dest='field', default='') + parser.add_argument("--chunk-size", dest="chunk_size", type=int, default=None) + # parser.add_argument('--squeeze', dest='squeeze', default=False, action='store_true') - args=parser.parse_args() + args = parser.parse_args() merge(**vars(args)) - diff --git a/hyperion/bin/pack-audio-files.py b/hyperion/bin/pack-audio-files.py index ebcbb9fa..4953d345 100755 --- a/hyperion/bin/pack-audio-files.py +++ b/hyperion/bin/pack-audio-files.py @@ -19,6 +19,7 @@ from hyperion.io import VADReaderFactory as VRF from hyperion.io import WSpecifier as WS + def process_vad(vad, length, fs, dilation, erosion): vad = signal.resample(vad, length) > 0.5 if dilation > 0: @@ -32,98 +33,134 @@ def process_vad(vad, length, fs, dilation, erosion): return vad -def pack_audio_files(input_path, output_spec, - vad_spec, vad_path_prefix, - vad_fs=100, vad_dilation=0, vad_erosion=0, - remove_dc_offset=False, **kwargs): - +def pack_audio_files( + input_path, + output_spec, + vad_spec, + vad_path_prefix, + vad_fs=100, + vad_dilation=0, + vad_erosion=0, + remove_dc_offset=False, + **kwargs +): input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info('input_args={}'.format(input_args)) - logging.info('output_args={}'.format(output_args)) + logging.info("input_args={}".format(input_args)) + logging.info("output_args={}".format(output_args)) output_spec = WS.create(output_spec) with AR(input_path, **input_args) as reader: - with Writer(output_spec.archive, output_spec.script, - **output_args) as writer: + with Writer(output_spec.archive, output_spec.script, **output_args) as writer: if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) + logging.info("opening VAD stream: %s" % (vad_spec)) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) t1 = time.time() for data in reader: key, x, fs_i = data assert writer.fs == fs_i - logging.info('Packing audio %s' % (key)) + logging.info("Packing audio %s" % (key)) t2 = time.time() tot_samples = x.shape[0] if vad_spec is not None: num_vad_frames = int(round(tot_samples * vad_fs / fs_i)) - vad = v_reader.read( - key, num_frames=num_vad_frames)[0].astype( - 'bool', copy=False) - logging.info('vad=%d/%d' % (np.sum(vad==1), len(vad))) + vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( + "bool", copy=False + ) + logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) vad = 
process_vad(vad, tot_samples, fs_i, vad_dilation, vad_erosion) - logging.info('vad=%d/%d' % (np.sum(vad==1), len(vad))) + logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) x = x[vad] - logging.info('utt %s detected %f/%f secs (%.2f %%) speech ' % ( - key[0], x.shape[0]/fs_i, tot_samples/fs_i, x.shape[0]/tot_samples*100)) + logging.info( + "utt %s detected %f/%f secs (%.2f %%) speech " + % ( + key[0], + x.shape[0] / fs_i, + tot_samples / fs_i, + x.shape[0] / tot_samples * 100, + ) + ) if remove_dc_offset: x -= np.mean(x) writer.write([key], [x]) t3 = time.time() - dt2 = (t2 - t1)*1000 - dt3 = (t3 - t1)*1000 - time_dur = len(x)/writer.fs - rtf = (time_dur*1000)/dt3 - logging.info(('Packed audio %s length=%0.3f secs ' - 'elapsed-time=%.2f ms. ' - 'read-time=%.2f ms. write-time=%.2f ms. ' - 'real-time-factor=%.2f' - 'x-range=[%f-%f]') % ( - key, time_dur, dt3, dt2, dt3-dt2, rtf, - np.min(x), np.max(x))) + dt2 = (t2 - t1) * 1000 + dt3 = (t3 - t1) * 1000 + time_dur = len(x) / writer.fs + rtf = (time_dur * 1000) / dt3 + logging.info( + ( + "Packed audio %s length=%0.3f secs " + "elapsed-time=%.2f ms. " + "read-time=%.2f ms. write-time=%.2f ms. " + "real-time-factor=%.2f" + "x-range=[%f-%f]" + ) + % (key, time_dur, dt3, dt2, dt3 - dt2, rtf, np.min(x), np.max(x)) + ) t1 = time.time() - - -if __name__ == "__main__": - - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Packs multiple audio files into single audio file') - parser.add_argument('--input', dest='input_path', required=True) - parser.add_argument('--output', dest='output_spec', required=True) - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--vad-path-prefix', default=None, - help=('scp file_path prefix for vad')) - parser.add_argument('--vad-fs', default=100, type=float, - help=('vad sampling frequency')) - - parser.add_argument('--vad-dilation', default=0, type=float, - help=('applies dilation operation to vad, in secs')) +if __name__ == "__main__": - parser.add_argument('--vad-erosion', default=0, type=float, - help=('applies erosion operation to vad (after dilation), in secs')) + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Packs multiple audio files into single audio file", + ) + + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + parser.add_argument( + "--vad-fs", default=100, type=float, help=("vad sampling frequency") + ) + + parser.add_argument( + "--vad-dilation", + default=0, + type=float, + help=("applies dilation operation to vad, in secs"), + ) + + parser.add_argument( + "--vad-erosion", + default=0, + type=float, + help=("applies erosion operation to vad (after dilation), in secs"), + ) AR.add_argparse_args(parser) Writer.add_argparse_args(parser) - parser.add_argument('--remove-dc-offset', default=False, action='store_true', - help='removes dc offset from file') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int, - help='Verbose level') - args=parser.parse_args() + parser.add_argument( + "--remove-dc-offset", + default=False, + action="store_true", + help="removes dc offset from file", + ) + parser.add_argument( + 
"-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + pack_audio_files(**vars(args)) - diff --git a/hyperion/bin/pack-wav-rirs.py b/hyperion/bin/pack-wav-rirs.py index 5833d825..00177988 100755 --- a/hyperion/bin/pack-wav-rirs.py +++ b/hyperion/bin/pack-wav-rirs.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -19,38 +24,46 @@ def pack_wav_rirs(input_path, output_spec, **kwargs): - writer = DWF.create(output_spec, scp_sep=' ', compress=False) + writer = DWF.create(output_spec, scp_sep=" ", compress=False) t1 = time.time() with AR(input_path, wav_scale=1) as reader: for data in reader: key, h, fs = data if h.ndim == 2: - h = h[:,0] + h = h[:, 0] h_delay = np.argmax(np.abs(h)) h_max = h[h_delay] h /= h_max - h[h<1e-3] = 0 + h[h < 1e-3] = 0 h = np.trim_zeros(h) - logging.info('Packing rir %s h_max=%f h_delay=%d h-length=%d' % (key, h_max, h_delay, len(h))) + logging.info( + "Packing rir %s h_max=%f h_delay=%d h-length=%d" + % (key, h_max, h_delay, len(h)) + ) writer.write([key], [h]) - logging.info('Packed RIRS elapsed-time=%.f' % (time.time()-t1)) - + logging.info("Packed RIRS elapsed-time=%.f" % (time.time() - t1)) + if __name__ == "__main__": - - parser=ArgumentParser( - description='Packs RIRs in wave format to h5/ark files') - - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_path', required=True) - parser.add_argument('--output', dest='output_spec', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int, - help='Verbose level') - args=parser.parse_args() + + parser = ArgumentParser(description="Packs RIRs in wave format to h5/ark files") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + pack_wav_rirs(**namespace_to_dict(args)) - diff --git a/hyperion/bin/plot-vector-hist.py b/hyperion/bin/plot-vector-hist.py index d7e1f0a6..cd86b1c1 100755 --- a/hyperion/bin/plot-vector-hist.py +++ b/hyperion/bin/plot-vector-hist.py @@ -12,7 +12,8 @@ import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.hyp_defs import config_logger @@ -20,8 +21,10 @@ from hyperion.transforms import TransformList -def plot_vector_hist(iv_file, v_list, preproc_file, output_path, num_bins, normed, **kwargs): - +def plot_vector_hist( + iv_file, v_list, preproc_file, output_path, num_bins, normed, **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -37,44 +40,45 @@ def plot_vector_hist(iv_file, v_list, preproc_file, output_path, num_bins, norme os.makedirs(ouput_path) for i in range(x.shape[1]): - - fig_file = '%s/D%04d.pdf' % (output_path, i) - - plt.hist(x[:,i], num_bins, normed=normed) - plt.xlabel('Dim %d' % i) + + fig_file = "%s/D%04d.pdf" 
% (output_path, i) + + plt.hist(x[:, i], num_bins, normed=normed) + plt.xlabel("Dim %d" % i) plt.grid(True) plt.show() plt.savefig(fig_file) plt.clf() - - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) - - + logging.info("Elapsed time: %.2f s." % (time.time() - t1)) + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Plots historgrams of i-vectors') + fromfile_prefix_chars="@", + description="Plots historgrams of i-vectors", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--v-list", dest="v_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--v-list', dest='v_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - VR.add_argparse_args(parser) - - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--no-normed', dest='normed', default=True, - action='store_false') - parser.add_argument('--num-bins', dest='num_bins', type=int, default=100) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "--no-normed", dest="normed", default=True, action="store_false" + ) + parser.add_argument("--num-bins", dest="num_bins", type=int, default=100) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - plot_vector_hist(**vars(args)) - + plot_vector_hist(**vars(args)) diff --git a/hyperion/bin/plot-vector-tsne.py b/hyperion/bin/plot-vector-tsne.py index 353cb029..030d7e39 100755 --- a/hyperion/bin/plot-vector-tsne.py +++ b/hyperion/bin/plot-vector-tsne.py @@ -12,7 +12,8 @@ import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from mpl_toolkits.mplot3d import Axes3D as plt3d @@ -23,15 +24,29 @@ from hyperion.helpers import VectorClassReader as VCR from hyperion.transforms import TransformList, PCA -colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k'] -markers = ['x', 'o', '+', '*', 's', 'h', 'D', '^', 'v', 'p', '8'] - -def plot_vector_tsne(iv_file, v_list, preproc_file, - output_path, save_embed, output_dim, - perplexity, exag, lr, num_iter, init_method, - rng_seed, verbose, pca_dim, max_classes, **kwargs): +colors = ["b", "g", "r", "c", "m", "y", "k"] +markers = ["x", "o", "+", "*", "s", "h", "D", "^", "v", "p", "8"] + + +def plot_vector_tsne( + iv_file, + v_list, + preproc_file, + output_path, + save_embed, + output_dim, + perplexity, + exag, + lr, + num_iter, + init_method, + rng_seed, + verbose, + pca_dim, + max_classes, + **kwargs +): - if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -54,42 +69,50 @@ def plot_vector_tsne(iv_file, v_list, preproc_file, tsne_obj = lambda n: TSNE( n_components=n, - perplexity=perplexity, early_exaggeration=exag, - learning_rate=lr, n_iter=num_iter, init=init_method, - random_state=rng_seed, verbose=verbose) - + perplexity=perplexity, + early_exaggeration=exag, + learning_rate=lr, + n_iter=num_iter, + init=init_method, + random_state=rng_seed, + 
verbose=verbose, + ) if max_classes > 0: index = class_ids < max_classes x = x[index] class_ids = class_ids[index] - + if output_dim > 3: tsne = tsne_obj(output_dim) y = tsne.fit_transform(x) if save_embed: - h5_file = '%s/embed_%dd.h5' % (output_path, ouput_dim) + h5_file = "%s/embed_%dd.h5" % (output_path, ouput_dim) hw = DWF.create(h5_file) hw.write(vcr.u2c.key, y) tsne = tsne_obj(2) y = tsne.fit_transform(x) if save_embed: - h5_file = '%s/embed_2d.h5' % output_path + h5_file = "%s/embed_2d.h5" % output_path hw = DWF.create(h5_file) hw.write(vcr.u2c.key, y) - - fig_file = '%s/tsne_2d.pdf' % (output_path) + + fig_file = "%s/tsne_2d.pdf" % (output_path) # plt.scatter(y[:,0], y[:,1], c=class_ids, marker='x') - color_marker = [(c,m) for m in markers for c in colors] + color_marker = [(c, m) for m in markers for c in colors] for c in np.unique(class_ids): idx = class_ids == c - plt.scatter(y[idx,0], y[idx,1], - c=color_marker[c][0], marker=color_marker[c][1], - label=vcr.class_names[c]) - + plt.scatter( + y[idx, 0], + y[idx, 1], + c=color_marker[c][0], + marker=color_marker[c][1], + label=vcr.class_names[c], + ) + plt.legend() plt.grid(True) plt.show() @@ -104,25 +127,28 @@ def plot_vector_tsne(iv_file, v_list, preproc_file, # plt.show() # plt.savefig(fig_file) # plt.clf() - tsne = tsne_obj(3) y = tsne.fit_transform(x) if save_embed: - h5_file = '%s/embed_3d.h5' % output_path + h5_file = "%s/embed_3d.h5" % output_path hw = DWF.create(h5_file) hw.write(vcr.u2c.key, y) - - fig_file = '%s/tsne_3d.pdf' % (output_path) + fig_file = "%s/tsne_3d.pdf" % (output_path) fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - #ax.scatter(y[:,0], y[:,1], y[:,2], c=class_ids, marker='x') + ax = fig.add_subplot(111, projection="3d") + # ax.scatter(y[:,0], y[:,1], y[:,2], c=class_ids, marker='x') for c in np.unique(class_ids): idx = class_ids == c - ax.scatter(y[idx,0], y[idx,1], y[idx,2], - c=color_marker[c][0], marker=color_marker[c][1], - label=vcr.class_names[c]) + ax.scatter( + y[idx, 0], + y[idx, 1], + y[idx, 2], + c=color_marker[c][0], + marker=color_marker[c][1], + label=vcr.class_names[c], + ) plt.grid(True) plt.show() @@ -139,42 +165,45 @@ def plot_vector_tsne(iv_file, v_list, preproc_file, # plt.savefig(fig_file) # plt.clf() - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) + - - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Plots TSNE embeddings') + fromfile_prefix_chars="@", + description="Plots TSNE embeddings", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--v-list', dest='v_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--v-list", dest="v_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VCR.add_argparse_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--save-embed', dest='save_embed', default=False, action='store_true') - - parser.add_argument('--output-dim', dest='output_dim', type=int, default=3) - parser.add_argument('--perplexity', dest='perplexity', type=float, default=30) - parser.add_argument('--exag', dest='exag', type=float, default=12) - parser.add_argument('--lr', dest='lr', type=float, default=200) - parser.add_argument('--num-iter', dest='num_iter', type=int, default=1000) - parser.add_argument('--init-method', dest='init_method', default='pca', - choices=['random', 'pca']) - parser.add_argument('--rng-seed', dest='rng_seed', type=int, default=1024) - parser.add_argument('--pca-dim', dest='pca_dim', type=int, default=50) - parser.add_argument('--max-classes', dest='max_classes', type=int, default=10) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "--save-embed", dest="save_embed", default=False, action="store_true" + ) + + parser.add_argument("--output-dim", dest="output_dim", type=int, default=3) + parser.add_argument("--perplexity", dest="perplexity", type=float, default=30) + parser.add_argument("--exag", dest="exag", type=float, default=12) + parser.add_argument("--lr", dest="lr", type=float, default=200) + parser.add_argument("--num-iter", dest="num_iter", type=int, default=1000) + parser.add_argument( + "--init-method", dest="init_method", default="pca", choices=["random", "pca"] + ) + parser.add_argument("--rng-seed", dest="rng_seed", type=int, default=1024) + parser.add_argument("--pca-dim", dest="pca_dim", type=int, default=50) + parser.add_argument("--max-classes", dest="max_classes", type=int, default=10) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) logging.debug(args) - - plot_vector_tsne(**vars(args)) - + plot_vector_tsne(**vars(args)) diff --git a/hyperion/bin/preprocess-audio-files.py b/hyperion/bin/preprocess-audio-files.py index 628b6c53..67b1cf61 100755 --- a/hyperion/bin/preprocess-audio-files.py +++ b/hyperion/bin/preprocess-audio-files.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -19,6 +24,7 @@ from hyperion.io import AudioWriter as Writer from hyperion.io import VADReaderFactory as VRF + def process_vad(vad, length, fs, dilation, erosion): 
vad = signal.resample(vad, length) > 0.5 if dilation > 0: @@ -32,51 +38,63 @@ def process_vad(vad, length, fs, dilation, erosion): return vad -def process_audio_files(input_path, output_path, output_script, - write_time_durs_spec, - vad_spec, vad_path_prefix, - vad_fs=100, vad_dilation=0, vad_erosion=0, - remove_dc_offset=False, **kwargs): - +def process_audio_files( + input_path, + output_path, + output_script, + write_time_durs_spec, + vad_spec, + vad_path_prefix, + vad_fs=100, + vad_dilation=0, + vad_erosion=0, + remove_dc_offset=False, + **kwargs +): input_args = AR.filter_args(**kwargs) output_args = Writer.filter_args(**kwargs) - logging.info('input_args={}'.format(input_args)) - logging.info('output_args={}'.format(output_args)) + logging.info("input_args={}".format(input_args)) + logging.info("output_args={}".format(output_args)) if write_time_durs_spec is not None: keys = [] info = [] with AR(input_path, **input_args) as reader: - with Writer(output_path, output_script, - **output_args) as writer: + with Writer(output_path, output_script, **output_args) as writer: if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) + logging.info("opening VAD stream: %s" % (vad_spec)) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) t1 = time.time() for data in reader: key, x, fs = data - logging.info('Processing audio %s' % (key)) + logging.info("Processing audio %s" % (key)) t2 = time.time() tot_samples = x.shape[0] if vad_spec is not None: num_vad_frames = int(round(tot_samples * vad_fs / fs)) - vad = v_reader.read( - key, num_frames=num_vad_frames)[0].astype( - 'bool', copy=False) - logging.info('vad=%d/%d' % (np.sum(vad==1), len(vad))) + vad = v_reader.read(key, num_frames=num_vad_frames)[0].astype( + "bool", copy=False + ) + logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) vad = process_vad(vad, tot_samples, fs, vad_dilation, vad_erosion) - logging.info('vad=%d/%d' % (np.sum(vad==1), len(vad))) + logging.info("vad=%d/%d" % (np.sum(vad == 1), len(vad))) x = x[vad] - logging.info('utt %s detected %f/%f secs (%.2f %%) speech ' % ( - key[0], x.shape[0]/fs, tot_samples/fs, x.shape[0]/tot_samples*100)) + logging.info( + "utt %s detected %f/%f secs (%.2f %%) speech " + % ( + key[0], + x.shape[0] / fs, + tot_samples / fs, + x.shape[0] / tot_samples * 100, + ) + ) - if x.shape[0] > 0: if remove_dc_offset: x -= np.mean(x) @@ -84,7 +102,7 @@ def process_audio_files(input_path, output_path, output_script, writer.write([key], [x], [fs]) if write_time_durs_spec is not None: keys.append(key) - info.append(x.shape[0]/fs) + info.append(x.shape[0] / fs) xmax = np.max(x) xmin = np.min(x) @@ -93,59 +111,82 @@ def process_audio_files(input_path, output_path, output_script, xmin = 0 t3 = time.time() - dt2 = (t2 - t1)*1000 - dt3 = (t3 - t1)*1000 - time_dur = len(x)/fs - rtf = (time_dur*1000)/dt3 - logging.info(('Packed audio %s length=%0.3f secs ' - 'elapsed-time=%.2f ms. ' - 'read-time=%.2f ms. write-time=%.2f ms. ' - 'real-time-factor=%.2f' - 'x-range=[%f-%f]') % ( - key, time_dur, dt3, dt2, dt3-dt2, rtf, - xmin, xmax)) + dt2 = (t2 - t1) * 1000 + dt3 = (t3 - t1) * 1000 + time_dur = len(x) / fs + rtf = (time_dur * 1000) / dt3 + logging.info( + ( + "Packed audio %s length=%0.3f secs " + "elapsed-time=%.2f ms. " + "read-time=%.2f ms. write-time=%.2f ms. 
" + "real-time-factor=%.2f" + "x-range=[%f-%f]" + ) + % (key, time_dur, dt3, dt2, dt3 - dt2, rtf, xmin, xmax) + ) t1 = time.time() - if write_time_durs_spec is not None: - logging.info('writing time durations to %s' % (write_time_durs_spec)) + logging.info("writing time durations to %s" % (write_time_durs_spec)) u2td = Utt2Info.create(keys, info) u2td.save(write_time_durs_spec) - - -if __name__ == "__main__": - - parser=ArgumentParser( - description='Process pipes in wav.scp file, optionally applies vad and save all audios in the same format') - - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_path', required=True) - parser.add_argument('--output-path', required=True) - parser.add_argument('--output-script', required=True) - parser.add_argument('--write-time-durs', dest='write_time_durs_spec', default=None) - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--vad-path-prefix', default=None, - help=('scp file_path prefix for vad')) - parser.add_argument('--vad-fs', default=100, type=float, - help=('vad sampling frequency')) - parser.add_argument('--vad-dilation', default=0, type=float, - help=('applies dilation operation to vad, in secs')) +if __name__ == "__main__": - parser.add_argument('--vad-erosion', default=0, type=float, - help=('applies erosion operation to vad (after dilation), in secs')) + parser = ArgumentParser( + description="Process pipes in wav.scp file, optionally applies vad and save all audios in the same format" + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output-path", required=True) + parser.add_argument("--output-script", required=True) + parser.add_argument("--write-time-durs", dest="write_time_durs_spec", default=None) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + parser.add_argument( + "--vad-fs", default=100, type=float, help=("vad sampling frequency") + ) + + parser.add_argument( + "--vad-dilation", + default=0, + type=float, + help=("applies dilation operation to vad, in secs"), + ) + + parser.add_argument( + "--vad-erosion", + default=0, + type=float, + help=("applies erosion operation to vad (after dilation), in secs"), + ) AR.add_class_args(parser) Writer.add_class_args(parser) - parser.add_argument('--remove-dc-offset', default=False, action='store_true', - help='removes dc offset from file') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int, - help='Verbose level') - args=parser.parse_args() + parser.add_argument( + "--remove-dc-offset", + default=False, + action="store_true", + help="removes dc offset from file", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + process_audio_files(**namespace_to_dict(args)) - diff --git a/hyperion/bin/rttm-to-bin-vad.py b/hyperion/bin/rttm-to-bin-vad.py index 24bd26ac..9c51ba2c 100755 --- a/hyperion/bin/rttm-to-bin-vad.py +++ b/hyperion/bin/rttm-to-bin-vad.py @@ -16,16 +16,30 @@ from hyperion.utils import SegmentList, RTTM from hyperion.io import DataWriterFactory as DWF -def rttm_to_bin_vad(rttm_file, num_frames_file, frame_shift, output_path, - fix_empy_files, part_idx, 
num_parts): + +def rttm_to_bin_vad( + rttm_file, + num_frames_file, + frame_shift, + output_path, + fix_empy_files, + part_idx, + num_parts, +): num_frames = None if num_frames_file is not None: - utt2num_frames = pd.read_csv(num_frames_file, sep='\s+', header=None, names=['file_id','num_frames'], index_col=0) + utt2num_frames = pd.read_csv( + num_frames_file, + sep="\s+", + header=None, + names=["file_id", "num_frames"], + index_col=0, + ) segments = RTTM.load(rttm_file).to_segment_list() - if num_parts > 1: + if num_parts > 1: if fix_empy_files: segments_orig = copy.deepcopy(segments) segments = segments.split(part_idx, num_parts) @@ -34,55 +48,97 @@ def rttm_to_bin_vad(rttm_file, num_frames_file, frame_shift, output_path, with DWF.create(output_path) as writer: for file_id in segments.uniq_file_id: - logging.info('processing VAD for %s' % (file_id)) + logging.info("processing VAD for %s" % (file_id)) if num_frames_file is not None: - num_frames = int(utt2num_frames.loc[file_id]['num_frames']) - vad = segments.to_bin_vad(file_id, frame_shift=frame_shift, num_frames=num_frames) + num_frames = int(utt2num_frames.loc[file_id]["num_frames"]) + vad = segments.to_bin_vad( + file_id, frame_shift=frame_shift, num_frames=num_frames + ) num_speech_frames = np.sum(vad) - logging.info('for %s detected %d/%d (%.2f %%) speech frames' % ( - file_id, num_speech_frames, num_frames, num_speech_frames/num_frames*100)) + logging.info( + "for %s detected %d/%d (%.2f %%) speech frames" + % ( + file_id, + num_speech_frames, + num_frames, + num_speech_frames / num_frames * 100, + ) + ) writer.write(file_id, vad) if fix_empy_files and part_idx == 1: for file_id in utt2num_frames.index: - if not(file_id in segments_orig.uniq_file_id): - logging.warning('not speeech detected in %s, putting all to 1' % (file_id)) - num_frames = int(utt2num_frames.loc[file_id]['num_frames']) - vad = np.ones((num_frames,), dtype='float32') + if not (file_id in segments_orig.uniq_file_id): + logging.warning( + "not speeech detected in %s, putting all to 1" % (file_id) + ) + num_frames = int(utt2num_frames.loc[file_id]["num_frames"]) + vad = np.ones((num_frames,), dtype="float32") writer.write(file_id, vad) - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='RTTM file to binary vad') - - parser.add_argument('--rttm', dest='rttm_file', required=True, - help='rttm file') - parser.add_argument('--num-frames', dest='num_frames_file', default=None, - help='num. 
frames in feature matrix') - parser.add_argument('--frame-shift', dest='frame_shift', default=10, type=float, - help='frame shift of feature matrix in ms.') - parser.add_argument('--output-path', dest='output_path', required=True, - help='wspecifier for binary vad file') - parser.add_argument('--fix-empy-files', dest='fix_empy_files', - default=False, action='store_true', - help='puts all vad frames to 1 when file is missing in rttm') - - parser.add_argument('--part-idx', dest='part_idx', type=int, default=1, - help=('splits the list of files in num-parts and process part_idx')) - parser.add_argument('--num-parts', dest='num_parts', type=int, default=1, - help=('splits the list of files in num-parts and process part_idx')) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int, - help='Verbose level') - args=parser.parse_args() + fromfile_prefix_chars="@", + description="RTTM file to binary vad", + ) + + parser.add_argument("--rttm", dest="rttm_file", required=True, help="rttm file") + parser.add_argument( + "--num-frames", + dest="num_frames_file", + default=None, + help="num. frames in feature matrix", + ) + parser.add_argument( + "--frame-shift", + dest="frame_shift", + default=10, + type=float, + help="frame shift of feature matrix in ms.", + ) + parser.add_argument( + "--output-path", + dest="output_path", + required=True, + help="wspecifier for binary vad file", + ) + parser.add_argument( + "--fix-empy-files", + dest="fix_empy_files", + default=False, + action="store_true", + help="puts all vad frames to 1 when file is missing in rttm", + ) + + parser.add_argument( + "--part-idx", + dest="part_idx", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + parser.add_argument( + "--num-parts", + dest="num_parts", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - rttm_to_bin_vad(**vars(args)) - + rttm_to_bin_vad(**vars(args)) diff --git a/hyperion/bin/segments-to-bin-vad.py b/hyperion/bin/segments-to-bin-vad.py index 410d9e37..2b3a7d91 100755 --- a/hyperion/bin/segments-to-bin-vad.py +++ b/hyperion/bin/segments-to-bin-vad.py @@ -5,7 +5,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -16,53 +21,101 @@ from hyperion.utils import SegmentList from hyperion.io import DataWriterFactory as DWF -def segments_to_bin_vad(segments_file, num_frames_file, frame_shift, output_path, part_idx, num_parts): + +def segments_to_bin_vad( + segments_file, num_frames_file, frame_shift, output_path, part_idx, num_parts +): num_frames = None if num_frames_file is not None: - utt2num_frames = pd.read_csv(num_frames_file, sep='\s+', header=None, names=['file_id','num_frames'], index_col=0) + utt2num_frames = pd.read_csv( + num_frames_file, + sep="\s+", + header=None, + names=["file_id", "num_frames"], + index_col=0, + ) segments = SegmentList.load(segments_file) - if num_parts > 1: + if num_parts > 1: segments = segments.split(part_idx, num_parts) with DWF.create(output_path) as writer: for file_id in segments.uniq_file_id: - 
logging.info('processing VAD for %s' % (file_id)) + logging.info("processing VAD for %s" % (file_id)) if num_frames_file is not None: - num_frames = int(utt2num_frames.loc[file_id]['num_frames']) - vad = segments.to_bin_vad(file_id, frame_shift=frame_shift, num_frames=num_frames) + num_frames = int(utt2num_frames.loc[file_id]["num_frames"]) + vad = segments.to_bin_vad( + file_id, frame_shift=frame_shift, num_frames=num_frames + ) num_speech_frames = np.sum(vad) - logging.info('for %s detected %d/%d (%.2f %%) speech frames' % ( - file_id, num_speech_frames, num_frames, num_speech_frames/num_frames*100)) + logging.info( + "for %s detected %d/%d (%.2f %%) speech frames" + % ( + file_id, + num_speech_frames, + num_frames, + num_speech_frames / num_frames * 100, + ) + ) writer.write(file_id, vad) - - + if __name__ == "__main__": - parser = ArgumentParser( - description='Segments file to binary vad') + parser = ArgumentParser(description="Segments file to binary vad") - parser.add_argument('--segments', dest='segments_file', required=True, - help='kaldi format segments file') - parser.add_argument('--num-frames', dest='num_frames_file', default=None, - help='num. frames in feature matrix') - parser.add_argument('--frame-shift', dest='frame_shift', default=10, type=float, - help='frame shift of feature matrix in ms.') - parser.add_argument('--output-path', dest='output_path', required=True, - help='wspecifier for binary vad file') - parser.add_argument('--part-idx', dest='part_idx', type=int, default=1, - help=('splits the list of files in num-parts and process part_idx')) - parser.add_argument('--num-parts', dest='num_parts', type=int, default=1, - help=('splits the list of files in num-parts and process part_idx')) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int, - help='Verbose level') - args=parser.parse_args() + parser.add_argument( + "--segments", + dest="segments_file", + required=True, + help="kaldi format segments file", + ) + parser.add_argument( + "--num-frames", + dest="num_frames_file", + default=None, + help="num. 
frames in feature matrix", + ) + parser.add_argument( + "--frame-shift", + dest="frame_shift", + default=10, + type=float, + help="frame shift of feature matrix in ms.", + ) + parser.add_argument( + "--output-path", + dest="output_path", + required=True, + help="wspecifier for binary vad file", + ) + parser.add_argument( + "--part-idx", + dest="part_idx", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + parser.add_argument( + "--num-parts", + dest="num_parts", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - segments_to_bin_vad(**namespace_to_dict(args)) - + segments_to_bin_vad(**namespace_to_dict(args)) diff --git a/hyperion/bin/torch-adv-finetune-xvec-from-wav.py b/hyperion/bin/torch-adv-finetune-xvec-from-wav.py index fd7cdb77..eb118102 100755 --- a/hyperion/bin/torch-adv-finetune-xvec-from-wav.py +++ b/hyperion/bin/torch-adv-finetune-xvec-from-wav.py @@ -6,10 +6,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -29,72 +34,84 @@ from hyperion.torch import TorchModelLoader as TML -def init_data(audio_path, train_list, val_list, - train_aug_cfg, val_aug_cfg, num_workers, - num_gpus, rank, **kwargs): +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs +): ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(ad_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) + feat_args = AF.filter_args(**kwargs["feats"]) if rank == 0: - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = 
AF(trans=True, **feat_args) if rank == 0: - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) return feat_extractor def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): - xvec_args = XVec.filter_finetune_args(**kwargs) + xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: - logging.info('xvector network ft args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = TML.load(in_model_path) model.rebuild_output_layer(**xvec_args) - if train_mode == 'ft-embed-affine': + if train_mode == "ft-embed-affine": model.freeze_preembed_layers() if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model def init_attack(feat_extractor, model, wav_scale, **kwargs): - victim_model = nn.Sequential( - feat_extractor, - model) - attack_args = AttackFactory.filter_args(**kwargs['attack']) - extra_args = {'eps_scale': wav_scale, - 'loss': nn.functional.cross_entropy, - 'time_dim': 1} + victim_model = nn.Sequential(feat_extractor, model) + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = { + "eps_scale": wav_scale, + "loss": nn.functional.cross_entropy, + "time_dim": 1, + } attack_args.update(extra_args) - logging.info('attacks args={}'.format(attack_args)) + logging.info("attacks args={}".format(attack_args)) attack = AttackFactory.create(victim_model, **attack_args) return attack @@ -107,27 +124,34 @@ def train_xvec(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') - - train_mode = kwargs['train_mode'] + set_float_cpu("float32") + + train_mode = kwargs["train_mode"] ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) feat_extractor = init_feats(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) - kwargs['wav_scale'] = train_loader.dataset.wav_scale + kwargs["wav_scale"] = train_loader.dataset.wav_scale attack = init_attack(feat_extractor, model, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, feat_extractor, attack, - device=device, metrics=metrics, - ddp=world_size>1, train_mode=train_mode, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + attack, + device=device, + metrics=metrics, + ddp=world_size > 1, + train_mode=train_mode, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -135,62 +159,75 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() - -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser( - description='Fine-tune x-vector model with adv attacks on wav domain') + description="Fine-tune x-vector model with adv attacks on wav domain" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--audio-path', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--val-list', dest='val_list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + 
parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", required=True) AD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--train-aug-cfg', default=None) - parser.add_argument('--val-aug-cfg', default=None) + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - AF.add_class_args(parser, prefix='feats') - parser.add_argument('--in-model-path', required=True) + AF.add_class_args(parser, prefix="feats") + parser.add_argument("--in-model-path", required=True) XVec.add_finetune_args(parser) - AttackFactory.add_class_args(parser, prefix='attack') + AttackFactory.add_class_args(parser, prefix="attack") Trainer.add_class_args(parser) ddp.add_ddp_args(parser) # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed (default: 1)') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('--train-mode', default='ft-full', - choices=['ft-full', 'ft-embed-affine'], - help=('ft-full: adapt full x-vector network' - 'ft-embed-affine: adapt affine transform before embedding')) + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed (default: 1)" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--train-mode", + default="ft-full", + choices=["ft-full", "ft-embed-affine"], + help=( + "ft-full: adapt full x-vector network" + "ft-embed-affine: adapt affine transform before embedding" + ), + ) # parser.add_argument('--attack-eps', required=True, type=float, # help='epsilon adversarial attack') # parser.add_argument('--attack-eps-step', required=True, type=float, # help='eps step adversarial attack') - # parser.add_argument('--attack-random-eps', default=False, + # parser.add_argument('--attack-random-eps', default=False, # action='store_true', # help='use random eps in adv. 
attack') - # parser.add_argument('--attack-max-iter', default=10, type=int, + # parser.add_argument('--attack-max-iter', default=10, type=int, # help='number of iterations for adversarial optimization') - # parser.add_argument('--p-attack', default=0.5, type=float, + # parser.add_argument('--p-attack', default=0.5, type=float, # help='ratio of batches with adv attack') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -198,13 +235,11 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) - - diff --git a/hyperion/bin/torch-adv-finetune-xvec.py b/hyperion/bin/torch-adv-finetune-xvec.py index 891057ca..ae2cb37b 100755 --- a/hyperion/bin/torch-adv-finetune-xvec.py +++ b/hyperion/bin/torch-adv-finetune-xvec.py @@ -7,10 +7,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -27,55 +32,56 @@ from hyperion.torch import TorchModelLoader as TML -def init_data(data_rspec, train_list, val_list, - num_workers, num_gpus, rank, **kwargs): +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): - xvec_args = XVec.filter_finetune_args(**kwargs) + xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: - logging.info('xvector network ft 
args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = TML.load(in_model_path) model.rebuild_output_layer(**xvec_args) - if train_mode == 'ft-embed-affine': + if train_mode == "ft-embed-affine": model.freeze_preembed_layers() if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model def init_attack(model, **kwargs): - attack_args = AttackFactory.filter_args(**kwargs['attack']) - extra_args = {'eps_scale': 1, - 'loss': nn.functional.cross_entropy, - 'time_dim': 1} + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = {"eps_scale": 1, "loss": nn.functional.cross_entropy, "time_dim": 1} attack_args.update(extra_args) - logging.info('attacks args={}'.format(attack_args)) + logging.info("attacks args={}".format(attack_args)) attack = AttackFactory.create(model, **attack_args) return attack @@ -87,25 +93,30 @@ def train_xvec(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") - train_mode = kwargs['train_mode'] + train_mode = kwargs["train_mode"] ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) attack = init_attack(model, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, attack, - device=device, metrics=metrics, - ddp=world_size>1, - train_mode=train_mode, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + attack, + device=device, + metrics=metrics, + ddp=world_size > 1, + train_mode=train_mode, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -114,9 +125,9 @@ def train_xvec(gpu_id, args): # def train_xvec(data_rspec, train_list, val_list, in_model_path, -# attack_eps, attack_eps_step, attack_random_eps, +# attack_eps, attack_eps_step, attack_random_eps, # attack_max_iter, p_attack, -# num_gpus, resume, num_workers, +# num_gpus, resume, num_workers, # train_mode, **kwargs): # set_float_cpu('float32') @@ -166,7 +177,7 @@ def train_xvec(gpu_id, args): # attack = PGDAttack(model, eps=attack_eps, alpha=attack_eps_step, # norm=float('inf'), max_iter=attack_max_iter, # random_eps=attack_random_eps) - + # trainer = Trainer(model, optimizer, attack, p_attack=p_attack, # device=device, metrics=metrics, lr_scheduler=lr_sch, # data_parallel=(num_gpus>1), train_mode=train_mode, @@ -176,51 +187,61 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) +if __name__ == "__main__": -if __name__ == '__main__': - - parser = ArgumentParser( - description='Fine-tune x-vector model with adv training') + parser = ArgumentParser(description="Fine-tune x-vector model with adv training") - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--data-rspec", required=True) + 
parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-model-path', required=True) + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument("--in-model-path", required=True) XVec.add_finetune_args(parser) - AttackFactory.add_class_args(parser, prefix='attack') + AttackFactory.add_class_args(parser, prefix="attack") Trainer.add_class_args(parser) ddp.add_ddp_args(parser) # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed (default: 1)') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('--train-mode', default='ft-embed-affine', - choices=['ft-full', 'ft-embed-affine'], - help=('ft-full: adapt full x-vector network' - 'ft-embed-affine: adapt affine transform before embedding')) + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed (default: 1)" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--train-mode", + default="ft-embed-affine", + choices=["ft-full", "ft-embed-affine"], + help=( + "ft-full: adapt full x-vector network" + "ft-embed-affine: adapt affine transform before embedding" + ), + ) # parser.add_argument('--attack-eps', required=True, type=float, # help='epsilon adversarial attack') # parser.add_argument('--attack-eps-step', required=True, type=float, # help='eps step adversarial attack') - # parser.add_argument('--attack-random-eps', default=False, + # parser.add_argument('--attack-random-eps', default=False, # action='store_true', # help='use random eps in adv. 
attack') - # parser.add_argument('--attack-max-iter', default=10, type=int, + # parser.add_argument('--attack-max-iter', default=10, type=int, # help='number of iterations for adversarial optimization') - # parser.add_argument('--p-attack', default=0.5, type=float, + # parser.add_argument('--p-attack', default=0.5, type=float, # help='ratio of batches with adv attack') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -231,4 +252,3 @@ def train_xvec(gpu_id, args): del args.seed train_xvec(**vars(args)) - diff --git a/hyperion/bin/torch-compute-mfcc-feats.py b/hyperion/bin/torch-compute-mfcc-feats.py index 10bc6be5..5f7d9f7d 100755 --- a/hyperion/bin/torch-compute-mfcc-feats.py +++ b/hyperion/bin/torch-compute-mfcc-feats.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -19,62 +24,84 @@ from hyperion.torch.layers import AudioFeatsFactory as AFF from hyperion.feats import MFCC -def compute_mfcc_feats(input_path, output_path, - compress, compression_method, write_num_frames, **kwargs): + +def compute_mfcc_feats( + input_path, output_path, compress, compression_method, write_num_frames, **kwargs +): mfcc_args = AFF.filter_args(**kwargs) mfcc = AFF.create(**mfcc_args) input_args = AR.filter_args(**kwargs) reader = AR(input_path, **input_args) - writer = DWF.create(output_path, scp_sep=' ', - compress=compress, - compression_method=compression_method) + writer = DWF.create( + output_path, + scp_sep=" ", + compress=compress, + compression_method=compression_method, + ) if write_num_frames is not None: - f_num_frames = open(write_num_frames, 'w') - + f_num_frames = open(write_num_frames, "w") + for data in reader: key, x, fs = data - logging.info('Extracting MFCC for %s' % (key)) + logging.info("Extracting MFCC for %s" % (key)) t1 = time.time() - x = torch.tensor(x[None,:], dtype=torch.get_default_dtype()) + x = torch.tensor(x[None, :], dtype=torch.get_default_dtype()) y = mfcc(x).squeeze(0).detach().numpy() - dt = (time.time() - t1)*1000 - rtf = mfcc.frame_shift*y.shape[0]/dt - logging.info('Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. real-time-factor=%.2f' % - (key, y.shape[0], dt, rtf)) + dt = (time.time() - t1) * 1000 + rtf = mfcc.frame_shift * y.shape[0] / dt + logging.info( + "Extracted MFCC for %s num-frames=%d elapsed-time=%.2f ms. 
real-time-factor=%.2f" + % (key, y.shape[0], dt, rtf) + ) writer.write([key], [y]) - + if write_num_frames is not None: - f_num_frames.write('%s %d\n' % (key, y.shape[0])) - - + f_num_frames.write("%s %d\n" % (key, y.shape[0])) + if write_num_frames is not None: f_num_frames.close() - + if __name__ == "__main__": - - parser=ArgumentParser( - description='Compute MFCC features in pytorch') - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_path', required=True) - parser.add_argument('--output', dest='output_path', required=True) - parser.add_argument('--write-num-frames', dest='write_num_frames', default=None) + parser = ArgumentParser(description="Compute MFCC features in pytorch") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_path", required=True) + parser.add_argument("--output", dest="output_path", required=True) + parser.add_argument("--write-num-frames", dest="write_num_frames", default=None) AR.add_class_args(parser) AFF.add_class_args(parser) - parser.add_argument('--compress', dest='compress', default=False, action='store_true', help='Compress the features') - parser.add_argument('--compression-method', dest='compression_method', default='auto', - choices=compression_methods, help='Compression method') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int, - help='Verbose level') - args=parser.parse_args() + parser.add_argument( + "--compress", + dest="compress", + default=False, + action="store_true", + help="Compress the features", + ) + parser.add_argument( + "--compression-method", + dest="compression_method", + default="auto", + choices=compression_methods, + help="Compression method", + ) + parser.add_argument( + "-v", + "--verbose", + dest="verbose", + default=1, + choices=[0, 1, 2, 3], + type=int, + help="Verbose level", + ) + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + compute_mfcc_feats(**namespace_to_dict(args)) - diff --git a/hyperion/bin/torch-eval-vae.py b/hyperion/bin/torch-eval-vae.py index 5f60a8bc..dfcdaa38 100755 --- a/hyperion/bin/torch-eval-vae.py +++ b/hyperion/bin/torch-eval-vae.py @@ -6,13 +6,19 @@ import time import logging from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import numpy as np import pandas as pd import matplotlib -matplotlib.use('Agg') -#matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) + +matplotlib.use("Agg") +# matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) import matplotlib.pyplot as plt import torch @@ -28,17 +34,18 @@ from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML + def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_mvn(device, **kwargs): - mvn_args = MVN.filter_args(**kwargs['mvn']) - logging.info('mvn args={}'.format(mvn_args)) + mvn_args = MVN.filter_args(**kwargs["mvn"]) + logging.info("mvn args={}".format(mvn_args)) mvn = MVN(**mvn_args) if mvn.norm_mean or mvn.norm_var: return mvn @@ -46,9 +53,9 @@ def 
init_mvn(device, **kwargs): def load_model(model_path, device): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('vae-model={}'.format(model)) + logging.info("vae-model={}".format(model)) model.to(device) model.eval() return model @@ -59,40 +66,52 @@ def write_img(output_dir, key, x, x_mean, x_sample, num_frames): vmax = np.max(x) vmin = np.min(x) if x.shape[1] > num_frames: - x = x[:,:num_frames] + x = x[:, :num_frames] x_mean = x_mean[:, :num_frames] - x_sample = x_sample[:,:num_frames] + x_sample = x_sample[:, :num_frames] elif x.shape[1] < num_frames: - x_extra = vmin * np.ones((x.shape[0], num_frames - x.shape[1]), dtype=float_cpu()) + x_extra = vmin * np.ones( + (x.shape[0], num_frames - x.shape[1]), dtype=float_cpu() + ) x = np.concatenate((x, x_extra), axis=1) x_mean = np.concatenate((x_mean, x_extra), axis=1) x_sample = np.concatenate((x_sample, x_extra), axis=1) - - cmap = plt.get_cmap('jet') - plt.subplots_adjust(left=0.05, bottom=0.05, right=0.95, top=0.95, wspace=0.1, hspace=0.1) - plt.figure(figsize=(12,8), dpi=300) - plt.subplot(3,1,1) + + cmap = plt.get_cmap("jet") + plt.subplots_adjust( + left=0.05, bottom=0.05, right=0.95, top=0.95, wspace=0.1, hspace=0.1 + ) + plt.figure(figsize=(12, 8), dpi=300) + plt.subplot(3, 1, 1) plt.imshow(x, aspect=2, cmap=cmap, vmax=vmax, vmin=vmin) - plt.subplot(3,1,2) + plt.subplot(3, 1, 2) plt.imshow(x_mean, aspect=2, cmap=cmap, vmax=vmax, vmin=vmin) - plt.subplot(3,1,3) + plt.subplot(3, 1, 3) plt.imshow(x_sample, aspect=2, cmap=cmap, vmax=vmax, vmin=vmin) - file_path = Path(output_dir, key + '.pdf') + file_path = Path(output_dir, key + ".pdf") plt.savefig(file_path) plt.close() - - - -def eval_vae(input_spec, vad_spec, write_num_frames_spec, - vad_path_prefix, - model_path, score_path, - write_x_mean_spec, write_x_sample_spec, write_z_sample_spec, - write_img_path, img_frames, - use_gpu, **kwargs): - - logging.info('initializing') - rng = np.random.RandomState(seed=1123581321+kwargs['part_idx']) + + +def eval_vae( + input_spec, + vad_spec, + write_num_frames_spec, + vad_path_prefix, + model_path, + score_path, + write_x_mean_spec, + write_x_sample_spec, + write_z_sample_spec, + write_img_path, + img_frames, + use_gpu, + **kwargs +): + + logging.info("initializing") + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) @@ -108,80 +127,81 @@ def eval_vae(input_spec, vad_spec, write_num_frames_spec, # model = TML.load(model_path) # model.to(device) # model.eval() - - x_mean_writer=None - x_sample_writer=None - z_sample_writer=None - fargs = {'return_x_mean': True} #args for forward function + + x_mean_writer = None + x_sample_writer = None + z_sample_writer = None + fargs = {"return_x_mean": True} # args for forward function if write_x_mean_spec is not None: - logging.info('opening write x-mean stream: %s' % (write_x_mean_spec)) - x_mean_writer = DWF.create(write_x_mean_spec) + logging.info("opening write x-mean stream: %s" % (write_x_mean_spec)) + x_mean_writer = DWF.create(write_x_mean_spec) if write_x_sample_spec is not None: - logging.info('opening write x-sample stream: %s' % (write_x_sample_spec)) + logging.info("opening write x-sample stream: %s" % (write_x_sample_spec)) x_sample_writer = DWF.create(write_x_sample_spec) - fargs['return_x_sample'] = True + fargs["return_x_sample"] = True if write_z_sample_spec is not None: 
- logging.info('opening write z-sample stream: %s' % (write_z_sample_spec)) + logging.info("opening write z-sample stream: %s" % (write_z_sample_spec)) z_sample_writer = DWF.create(write_z_sample_spec) - fargs['return_z_sample'] = True + fargs["return_z_sample"] = True if write_img_path is not None: - logging.info('making img dir: %s' % (write_img_path)) - fargs['return_x_mean'] = True - fargs['return_x_sample'] = True + logging.info("making img dir: %s" % (write_img_path)) + fargs["return_x_mean"] = True + fargs["return_x_sample"] = True Path(write_img_path).mkdir(parents=True, exist_ok=True) - metrics = ['loss', 'elbo', 'log_px', 'kldiv_z', - 'vq_loss', 'log_perplexity'] - extra_metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } + metrics = ["loss", "elbo", "log_px", "kldiv_z", "vq_loss", "log_perplexity"] + extra_metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} scores_df = [] dr_args = DRF.filter_args(**kwargs) - logging.info('opening input stream: %s' % (input_spec)) + logging.info("opening input stream: %s" % (input_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) + logging.info("opening VAD stream: %s" % (vad_spec)) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) - + while not reader.eof(): t1 = time.time() key, data = reader.read(1) if len(key) == 0: break t2 = time.time() - logging.info('processing utt %s' % (key[0])) + logging.info("processing utt %s" % (key[0])) x = data[0] if mvn is not None: x = mvn.normalize(x) t3 = time.time() tot_frames = x.shape[0] if vad_spec is not None: - vad = v_reader.read( - key, num_frames=x.shape[0])[0].astype( - 'bool', copy=False) + vad = v_reader.read(key, num_frames=x.shape[0])[0].astype( + "bool", copy=False + ) x = x[vad] - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % ( - key[0], x.shape[0], tot_frames, x.shape[0]/tot_frames*100)) - + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % (key[0], x.shape[0], tot_frames, x.shape[0] / tot_frames * 100) + ) + t4 = time.time() - scores = {'key': key[0]} + scores = {"key": key[0]} if x.shape[0] == 0: x_mean, x_sample = np.zeros((1, x.shape[1]), dtype=float_cpu()) z_sample = np.zeros((1, model.z_dim), dtype=float_cpu()) else: - xx = torch.tensor(x.T[None,:], dtype=torch.get_default_dtype()) + xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) with torch.no_grad(): xx = xx.to(device) output = model(xx, **fargs) - x_mean = output['x_mean'] + x_mean = output["x_mean"] for metric in metrics: if metric in output: scores[metric] = output[metric].mean().item() - + for metric in extra_metrics.keys(): scores[metric] = extra_metrics[metric](x_mean, xx).item() @@ -191,21 +211,20 @@ def eval_vae(input_spec, vad_spec, write_num_frames_spec, # mse = nn.functional.mse_loss(px.mean, xx).item() # l1 = nn.functional.l1_loss(px.mean, xx).item() - logging.info('utt {} scores={}'.format(key[0], scores)) - - #logging.info('utt %s elbo=%.2f E[logP(x|z)]=%.2f KL(q(z)||p(z))=%.2f mse=%.2f l1=%.2f' % ( + logging.info("utt {} scores={}".format(key[0], scores)) + + # logging.info('utt %s elbo=%.2f E[logP(x|z)]=%.2f KL(q(z)||p(z))=%.2f mse=%.2f l1=%.2f' % ( # key[0], elbo, log_px, kldiv_z, mse, l1)) - + x_mean = x_mean.cpu().numpy()[0] - if 'x_sample' in output: - x_sample = output['x_sample'].cpu().numpy()[0] - if 'z_sample' in output: - z_sample = output['z_mean'].cpu().numpy()[0] + if "x_sample" in output: + x_sample = output["x_sample"].cpu().numpy()[0] + if "z_sample" in 
output: + z_sample = output["z_mean"].cpu().numpy()[0] if write_img_path: - write_img(write_img_path, key[0], x.T, - x_mean, x_sample, img_frames) - + write_img(write_img_path, key[0], x.T, x_mean, x_sample, img_frames) + t5 = time.time() scores_df.append(pd.DataFrame(scores, index=[0])) if x_mean_writer is not None: @@ -219,70 +238,111 @@ def eval_vae(input_spec, vad_spec, write_num_frames_spec, keys.append(key[0]) info.append(str(x.shape[0])) t6 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f ' - 'vad-time=%.3f vae-time=%.3f write-time=%.3f ' - 'rt-factor=%.2f') % ( - key[0], t6-t1, t2-t1, t3-t2, t4-t3, - t5-t4, t6-t5, x.shape[0]*1e-2/(t6-t1))) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f " + "vad-time=%.3f vae-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key[0], + t6 - t1, + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + t6 - t5, + x.shape[0] * 1e-2 / (t6 - t1), + ) + ) scores_df = pd.concat(scores_df, ignore_index=True) - scores_df.to_csv(score_path, index=False, na_rep='n/a') + scores_df.to_csv(score_path, index=False, na_rep="n/a") if write_num_frames_spec is not None: - logging.info('writing num-frames to %s' % (write_num_frames_spec)) + logging.info("writing num-frames to %s" % (write_num_frames_spec)) u2nf = Utt2Info.create(keys, info) u2nf.save(write_num_frames_spec) - - + + if __name__ == "__main__": - - parser = ArgumentParser( - description='Extract x-vectors with pytorch model') - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_spec', required=True) + parser = ArgumentParser(description="Extract x-vectors with pytorch model") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) DRF.add_class_args(parser) - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--write-num-frames', dest='write_num_frames_spec', default=None) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) # parser.add_argument('--scp-sep', dest='scp_sep', default=' ', # help=('scp file field separator')) # parser.add_argument('--path-prefix', dest='path_prefix', default=None, # help=('scp file_path prefix')) - parser.add_argument('--vad-path-prefix', dest='vad_path_prefix', default=None, - help=('scp file_path prefix for vad')) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) - MVN.add_class_args(parser, prefix='mvn') + MVN.add_class_args(parser, prefix="mvn") - parser.add_argument('--model-path', required=True) - # parser.add_argument('--chunk-length', type=int, default=0, + parser.add_argument("--model-path", required=True) + # parser.add_argument('--chunk-length', type=int, default=0, # help=('number of frames used in each forward pass of the x-vector encoder,' # 'if 0 the full utterance is used')) - parser.add_argument('--write-x-mean', dest='write_x_mean_spec', default=None, - help='write-specifier for the mean of P(x|z)') - parser.add_argument('--write-x-sample', dest='write_x_sample_spec', default=None, - help='write-specifier for samples drawn from x ~ P(x|z)') - parser.add_argument('--write-z-sample', dest='write_z_sample_spec', default=None, - help='write-specifier for samples drawn from z ~ Q(z|x)') - parser.add_argument('--write-img-path', default=None, - help='output directory to 
save spectrogram images in pdf format') - parser.add_argument('--img-frames', default=400, type=int, - help='number of frames to plot in the images') - parser.add_argument('--scores', dest='score_path', required=True, - help='output file to write ELBO and other metrics') - parser.add_argument('--use-gpu', default=False, action='store_true', - help='extract xvectors in gpu') + parser.add_argument( + "--write-x-mean", + dest="write_x_mean_spec", + default=None, + help="write-specifier for the mean of P(x|z)", + ) + parser.add_argument( + "--write-x-sample", + dest="write_x_sample_spec", + default=None, + help="write-specifier for samples drawn from x ~ P(x|z)", + ) + parser.add_argument( + "--write-z-sample", + dest="write_z_sample_spec", + default=None, + help="write-specifier for samples drawn from z ~ Q(z|x)", + ) + parser.add_argument( + "--write-img-path", + default=None, + help="output directory to save spectrogram images in pdf format", + ) + parser.add_argument( + "--img-frames", + default=400, + type=int, + help="number of frames to plot in the images", + ) + parser.add_argument( + "--scores", + dest="score_path", + required=True, + help="output file to write ELBO and other metrics", + ) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) # parser.add_argument('--part-idx', type=int, default=1, # help=('splits the list of files in num-parts and process part_idx')) # parser.add_argument('--num-parts', type=int, default=1, # help=('splits the list of files in num-parts and process part_idx')) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) eval_vae(**namespace_to_dict(args)) - diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py index 59b8a572..8d55b719 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav-wavegan.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -40,15 +45,17 @@ class MyModel(nn.Module): - def __init__(self, - feat_extractor, - xvector_model, - embed_layer=None, - calibrator=None, - sigma=0, - smoothing_after_wavegan=None, - wave_gan_defender=None, - wav_scale=2**15 - 1): + def __init__( + self, + feat_extractor, + xvector_model, + embed_layer=None, + calibrator=None, + sigma=0, + smoothing_after_wavegan=None, + wave_gan_defender=None, + wav_scale=2 ** 15 - 1, + ): super().__init__() self.feat_extractor = feat_extractor self.xvector_model = xvector_model @@ -92,8 +99,7 @@ def forward(self, s_t): f_t = f_t[:, self.vad_t] f_t = f_t.transpose(1, 2).contiguous() - x_t = self.xvector_model.extract_embed(f_t, - embed_layer=self.embed_layer) + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) x_t = l2_norm(x_t) x_e = l2_norm(self.x_e) score = torch.sum(x_e * x_t, dim=-1) @@ -116,36 +122,36 @@ def fix_out_of_memory(model, tensors): def init_device(use_gpu): - set_float_cpu('float32') + 
set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_feats(**kwargs): - feat_args = AF.filter_args(**kwargs['feats']) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) feat_extractor.eval() return feat_extractor def load_model(model_path): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.eval() return model def load_calibrator(cal_file, threshold): - logging.info('loading calibration params {}'.format(cal_file)) + logging.info("loading calibration params {}".format(cal_file)) lr = LR.load(cal_file) - #subting the threshold here will put the decision threshold in 0 - #some attacks use thr=0 to decide if the attack is succesful + # subting the threshold here will put the decision threshold in 0 + # some attacks use thr=0 to decide if the attack is succesful calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold) calibrator.eval() return calibrator @@ -166,20 +172,38 @@ def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): return key, x_e -def eval_cosine_scoring_wavegan(v_file, key_file, enroll_file, test_wav_file, - vad_spec, vad_path_prefix, model_path, - embed_layer, score_file, stats_file, cal_file, - threshold, smooth_sigma, max_test_length, - save_adv_wav, save_adv_wav_path, use_gpu, - seg_part_idx, num_seg_parts, - smoothing_after_wavegan, wave_gan_root_dir, - wave_gan_model_ckpt, **kwargs): +def eval_cosine_scoring_wavegan( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + score_file, + stats_file, + cal_file, + threshold, + smooth_sigma, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + smoothing_after_wavegan, + wave_gan_root_dir, + wave_gan_model_ckpt, + **kwargs +): device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) - wave_gan_defender = WaveGANDefender(Path(wave_gan_root_dir), - Path(wave_gan_model_ckpt)) + wave_gan_defender = WaveGANDefender( + Path(wave_gan_root_dir), Path(wave_gan_model_ckpt) + ) xvector_model = load_model(model_path) calibrator = None @@ -189,9 +213,8 @@ def eval_cosine_scoring_wavegan(v_file, key_file, enroll_file, test_wav_file, tar = torch.as_tensor([1], dtype=torch.float).to(device) non = torch.as_tensor([0], dtype=torch.float).to(device) - logging.info('loading key and enrollment x-vectors') - key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, - num_seg_parts) + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) audio_args = AR.filter_args(**kwargs) @@ -199,41 +222,57 @@ def eval_cosine_scoring_wavegan(v_file, key_file, enroll_file, test_wav_file, wav_scale = audio_reader.wav_scale if save_adv_wav: - 
tar_audio_writer = AW(save_adv_wav_path + '/tar2non') - non_audio_writer = AW(save_adv_wav_path + '/non2tar') - - model = MyModel(feat_extractor, xvector_model, embed_layer, calibrator, - smooth_sigma, smoothing_after_wavegan, wave_gan_defender, - wav_scale) + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") + + model = MyModel( + feat_extractor, + xvector_model, + embed_layer, + calibrator, + smooth_sigma, + smoothing_after_wavegan, + wave_gan_defender, + wav_scale, + ) model.to(device) model.eval() - attack_args = AttackFactory.filter_args(**kwargs['attack']) + attack_args = AttackFactory.filter_args(**kwargs["attack"]) extra_args = { - 'eps_scale': wav_scale, - 'range_min': -wav_scale, - 'range_max': wav_scale, - 'loss': nn.functional.binary_cross_entropy_with_logits, - 'time_dim': 1 + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, } attack_args.update(extra_args) - logging.info('attacks args={}'.format(attack_args)) + logging.info("attacks args={}".format(attack_args)) attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) - v_reader = VRF.create(vad_spec, - path_prefix=vad_path_prefix, - scp_sep=' ') - - scores = np.zeros((key.num_models, key.num_tests), dtype='float32') - attack_stats = pd.DataFrame(columns=[ - 'modelid', 'segmentid', 'snr', 'px', 'pn', 'x_l2', 'x_linf', 'n_l0', - 'n_l2', 'n_linf', 'num_frames' - ]) + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) for j in range(key.num_tests): t1 = time.time() - logging.info('scoring test utt %s' % (key.seg_set[j])) + logging.info("scoring test utt %s" % (key.seg_set[j])) s, fs = audio_reader.read([key.seg_set[j]]) s = s[0] fs = fs[0] @@ -244,21 +283,25 @@ def eval_cosine_scoring_wavegan(v_file, key_file, enroll_file, test_wav_file, s = s[:max_samples] s_cpu = s[None, :] - s = torch.as_tensor(s_cpu, - dtype=torch.get_default_dtype(), - device=device) + s = torch.as_tensor(s_cpu, dtype=torch.get_default_dtype(), device=device) if vad_spec is not None: vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), - dtype=torch.bool, - device=device) + vad = torch.as_tensor( + vad.astype(np.bool, copy=False), dtype=torch.bool, device=device + ) model.vad_t = vad - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % - (key.seg_set[j], speech_frames, tot_frames, - speech_frames / tot_frames * 100)) + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) t2 = time.time() @@ -284,34 +327,33 @@ def eval_cosine_scoring_wavegan(v_file, key_file, enroll_file, test_wav_file, scores[i, j] = model(s_adv) + threshold t4 = time.time() - trial_time += (t4 - t3) + trial_time += t4 - t3 num_trials += 1 s_adv = s_adv.detach() stats_ij = compute_stats_adv_attack(s, s_adv) - stats_ij = [ - stat.detach().cpu().numpy()[0] for stat in stats_ij - ] + stats_ij = [stat.detach().cpu().numpy()[0] for 
stat in stats_ij] attack_stats = attack_stats.append( { - 'modelid': key.model_set[i], - 'segmentid': key.seg_set[j], - 'snr': stats_ij[0], - 'px': stats_ij[1], - 'pn': stats_ij[2], - 'x_l2': stats_ij[3], - 'x_linf': stats_ij[4], - 'n_l0': stats_ij[5], - 'n_l2': stats_ij[6], - 'n_linf': stats_ij[7], - 'num_samples': s.shape[-1] + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], }, - ignore_index=True) + ignore_index=True, + ) - #logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) if save_adv_wav: s_adv = s_adv.cpu().numpy()[0] - trial_name = '%s-%s' % (key.model_set[i], key.seg_set[j]) + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) if key.tar[i, j] and scores[i, j] < threshold: tar_audio_writer.write(trial_name, s_adv, fs) elif key.non[i, j] and scores[i, j] > threshold: @@ -319,86 +361,97 @@ def eval_cosine_scoring_wavegan(v_file, key_file, enroll_file, test_wav_file, trial_time /= num_trials t7 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d ' - 'rt-factor=%.5f'), key.seg_set[j], t7 - t1, t2 - t1, trial_time, - num_trials, (t7 - t1) / (num_trials * s.shape[1] / fs)) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.5f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) if num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, 1, seg_part_idx) - stats_file = '%s-%03d-%03d' % (stats_file, 1, seg_part_idx) - logging.info('saving scores to %s' % (score_file)) - s = TrialScores(key.model_set, - key.seg_set, - scores, - score_mask=np.logical_or(key.tar, key.non)) + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) s.save_txt(score_file) - logging.info('saving stats to %s' % (stats_file)) + logging.info("saving stats to %s" % (stats_file)) attack_stats.to_csv(stats_file) if __name__ == "__main__": parser = ArgumentParser( - description='Eval cosine-scoring given enroll x-vector and test wave') + description="Eval cosine-scoring given enroll x-vector and test wave" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--v-file', dest='v_file', required=True) - parser.add_argument('--key-file', dest='key_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-wav-file', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", dest="v_file", required=True) + parser.add_argument("--key-file", dest="key_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") - parser.add_argument('--vad', dest='vad_spec', default=None) - 
parser.add_argument('--vad-path-prefix', - dest='vad_path_prefix', - default=None, - help=('scp file_path prefix for vad')) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) - parser.add_argument('--model-path', required=True) + parser.add_argument("--model-path", required=True) parser.add_argument( - '--embed-layer', + "--embed-layer", type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) - parser.add_argument('--use-gpu', - default=False, - action='store_true', - help='extract xvectors in gpu') + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) - AttackFactory.add_class_args(parser, prefix='attack') + AttackFactory.add_class_args(parser, prefix="attack") - parser.add_argument('--seg-part-idx', - default=1, - type=int, - help=('test part index')) + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) parser.add_argument( - '--num-seg-parts', + "--num-seg-parts", default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', - '--verbose', - dest='verbose', - default=1, - choices=[0, 1, 2, 3], - type=int) - - parser.add_argument('--save-adv-wav', - default=False, - action='store_true', - help='save adversarial signals to disk') - parser.add_argument('--save-adv-wav-path', - default=None, - help='output path of adv signals') + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) # parser.add_argument('--save-adv-wav-tar-thr', # default=0.75, type=float, @@ -408,40 +461,42 @@ def eval_cosine_scoring_wavegan(v_file, key_file, enroll_file, test_wav_file, # default=-0.75, type=float, # help='max score to save signal from attack that makes tar into non-tar') - parser.add_argument('--stats-file', - default=None, - help='output path of to save stats of adv signals') - - parser.add_argument('--cal-file', - default=None, - help='score calibration file') - parser.add_argument('--threshold', - default=0, - type=float, - help='decision threshold') - parser.add_argument('--smooth-sigma', - default=0, - type=float, - help='sigma for smoothing') - parser.add_argument('--max-test-length', - default=5, - type=float, - help=('maximum length (secs) for the test side, ' - 'this is to avoid GPU memory errors')) + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--smooth-sigma", default=0, type=float, help="sigma for smoothing" + ) 
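For the WaveGAN variant, --smoothing-after-wavegan only controls the order in which the two waveform defenses are applied (its help text below spells this out). A minimal sketch of that ordering, assuming the smoothing is additive Gaussian noise with the given sigma and that the WaveGANDefender object is callable on the waveform tensor:

import torch

# Sketch (assumptions): ordering of the two waveform defenses controlled by
# smoothing_after_wavegan; the additive-Gaussian form of the smoothing and the
# defender's call signature are illustrative, not taken from hyperion's code.
def defend_waveform(s_t, wave_gan_defender, sigma, smoothing_after_wavegan):
    if smoothing_after_wavegan:
        s_t = wave_gan_defender(s_t)
        s_t = s_t + sigma * torch.randn_like(s_t)
    else:
        s_t = s_t + sigma * torch.randn_like(s_t)
        s_t = wave_gan_defender(s_t)
    return s_t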
+ parser.add_argument( + "--max-test-length", + default=5, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) # Defense: WaveGAN specific arguments [Added Sonal May21] - parser.add_argument('--smoothing-after-wavegan', - default=False, - action='store_true', - help=('Smoothing before or after wavegan, if true: ' - 'smoothing is done after wavegan')) - - parser.add_argument('--wave-gan-root-dir', - default=None, - help='WaveGAN model root directory') - parser.add_argument('--wave-gan-model-ckpt', - default=None, - help='WaveGAN model checkpoint') + parser.add_argument( + "--smoothing-after-wavegan", + default=False, + action="store_true", + help=( + "Smoothing before or after wavegan, if true: " + "smoothing is done after wavegan" + ), + ) + + parser.add_argument( + "--wave-gan-root-dir", default=None, help="WaveGAN model root directory" + ) + parser.add_argument( + "--wave-gan-model-ckpt", default=None, help="WaveGAN model checkpoint" + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py index 6c3e54b9..a5783654 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-adv-test-wav.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -34,12 +39,9 @@ class MyModel(nn.Module): - def __init__(self, - feat_extractor, - xvector_model, - embed_layer=None, - calibrator=None, - sigma=0): + def __init__( + self, feat_extractor, xvector_model, embed_layer=None, calibrator=None, sigma=0 + ): super().__init__() self.feat_extractor = feat_extractor self.xvector_model = xvector_model @@ -66,8 +68,7 @@ def forward(self, s_t): f_t = f_t[:, self.vad_t] f_t = f_t.transpose(1, 2).contiguous() - x_t = self.xvector_model.extract_embed(f_t, - embed_layer=self.embed_layer) + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) x_t = l2_norm(x_t) x_e = l2_norm(self.x_e) score = torch.sum(x_e * x_t, dim=-1) @@ -78,36 +79,36 @@ def forward(self, s_t): def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_feats(**kwargs): - feat_args = AF.filter_args(**kwargs['feats']) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) feat_extractor.eval() return feat_extractor def load_model(model_path): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.eval() return model def load_calibrator(cal_file, threshold): - 
logging.info('loading calibration params {}'.format(cal_file)) + logging.info("loading calibration params {}".format(cal_file)) lr = LR.load(cal_file) - #subting the threshold here will put the decision threshold in 0 - #some attacks use thr=0 to decide if the attack is succesful + # subting the threshold here will put the decision threshold in 0 + # some attacks use thr=0 to decide if the attack is succesful calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold) calibrator.eval() return calibrator @@ -128,11 +129,28 @@ def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): return key, x_e -def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, - vad_path_prefix, model_path, embed_layer, score_file, - stats_file, cal_file, threshold, smooth_sigma, - max_test_length, save_adv_wav, save_adv_wav_path, - use_gpu, seg_part_idx, num_seg_parts, **kwargs): +def eval_cosine_scoring( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + score_file, + stats_file, + cal_file, + threshold, + smooth_sigma, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) @@ -145,9 +163,8 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, tar = torch.as_tensor([1], dtype=torch.float).to(device) non = torch.as_tensor([0], dtype=torch.float).to(device) - logging.info('loading key and enrollment x-vectors') - key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, - num_seg_parts) + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) audio_args = AR.filter_args(**kwargs) @@ -155,40 +172,50 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, wav_scale = audio_reader.wav_scale if save_adv_wav: - tar_audio_writer = AW(save_adv_wav_path + '/tar2non') - non_audio_writer = AW(save_adv_wav_path + '/non2tar') + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") smooth_sigma *= wav_scale - model = MyModel(feat_extractor, xvector_model, embed_layer, calibrator, - smooth_sigma) + model = MyModel( + feat_extractor, xvector_model, embed_layer, calibrator, smooth_sigma + ) model.to(device) model.eval() - attack_args = AttackFactory.filter_args(**kwargs['attack']) + attack_args = AttackFactory.filter_args(**kwargs["attack"]) extra_args = { - 'eps_scale': wav_scale, - 'range_min': -wav_scale, - 'range_max': wav_scale, - 'loss': nn.functional.binary_cross_entropy_with_logits, - 'time_dim': 1 + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, } attack_args.update(extra_args) - logging.info('attacks args={}'.format(attack_args)) + logging.info("attacks args={}".format(attack_args)) attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: - logging.info('opening VAD stream: %s', vad_spec) - v_reader = VRF.create(vad_spec, - path_prefix=vad_path_prefix, - scp_sep=' ') - - scores = np.zeros((key.num_models, key.num_tests), dtype='float32') - attack_stats = pd.DataFrame(columns=[ - 'modelid', 'segmentid', 'snr', 'px', 'pn', 'x_l2', 'x_linf', 'n_l0', - 'n_l2', 'n_linf', 'num_frames' - ]) + logging.info("opening VAD 
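The comment in load_calibrator notes that subtracting the decision threshold from the calibration offset puts the operating point at 0, which is why some attacks can simply test success against a score of 0. A tiny worked check with purely illustrative numbers:

# Worked check (illustrative numbers): with calibrated = A * raw + (b - thr),
# "calibrated >= 0" is exactly the original decision "A * raw + b >= thr".
A, b, thr = 10.0, -2.0, 3.0   # hypothetical calibration slope, offset, threshold
raw = 0.55                    # hypothetical raw cosine score
calibrated = A * raw + (b - thr)             # 0.5
print(calibrated >= 0, A * raw + b >= thr)   # True True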
stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) for j in range(key.num_tests): t1 = time.time() - logging.info('scoring test utt %s', key.seg_set[j]) + logging.info("scoring test utt %s", key.seg_set[j]) s, fs = audio_reader.read([key.seg_set[j]]) s = s[0] fs = fs[0] @@ -197,19 +224,25 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, if len(s) > max_samples: s = s[:max_samples] - s = torch.as_tensor(s[None, :], - dtype=torch.get_default_dtype()).to(device) + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) if vad_spec is not None: vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), - dtype=torch.bool).to(device) + vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + device + ) model.vad_t = vad - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % - (key.seg_set[j], speech_frames, tot_frames, - speech_frames / tot_frames * 100)) + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) t2 = time.time() @@ -236,34 +269,33 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, scores[i, j] = model(s_adv) + threshold t4 = time.time() - trial_time += (t4 - t3) + trial_time += t4 - t3 num_trials += 1 s_adv = s_adv.detach() stats_ij = compute_stats_adv_attack(s, s_adv) - stats_ij = [ - stat.detach().cpu().numpy()[0] for stat in stats_ij - ] + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] attack_stats = attack_stats.append( { - 'modelid': key.model_set[i], - 'segmentid': key.seg_set[j], - 'snr': stats_ij[0], - 'px': stats_ij[1], - 'pn': stats_ij[2], - 'x_l2': stats_ij[3], - 'x_linf': stats_ij[4], - 'n_l0': stats_ij[5], - 'n_l2': stats_ij[6], - 'n_linf': stats_ij[7], - 'num_samples': s.shape[-1] + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], }, - ignore_index=True) + ignore_index=True, + ) - #logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) if save_adv_wav: s_adv = s_adv.cpu().numpy()[0] - trial_name = '%s-%s' % (key.model_set[i], key.seg_set[j]) + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) if key.tar[i, j] and scores[i, j] < threshold: tar_audio_writer.write(trial_name, s_adv, fs) elif key.non[i, j] and scores[i, j] > threshold: @@ -271,86 +303,97 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, trial_time /= num_trials t7 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.4f trial-time=%.4f n_trials=%d ' - 'rt-factor=%.5f'), key.seg_set[j], t7 - t1, t2 - t1, trial_time, - num_trials, (t7 - t1) / (num_trials * s.shape[1] / fs)) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.4f trial-time=%.4f 
n_trials=%d " + "rt-factor=%.5f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) if num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, 1, seg_part_idx) - stats_file = '%s-%03d-%03d' % (stats_file, 1, seg_part_idx) - logging.info('saving scores to %s', score_file) - s = TrialScores(key.model_set, - key.seg_set, - scores, - score_mask=np.logical_or(key.tar, key.non)) + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) s.save_txt(score_file) - logging.info('saving stats to %s' % (stats_file)) + logging.info("saving stats to %s" % (stats_file)) attack_stats.to_csv(stats_file) if __name__ == "__main__": parser = ArgumentParser( - description='Eval cosine-scoring given enroll x-vector and test wave') + description="Eval cosine-scoring given enroll x-vector and test wave" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--v-file', dest='v_file', required=True) - parser.add_argument('--key-file', dest='key_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-wav-file', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", dest="v_file", required=True) + parser.add_argument("--key-file", dest="key_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--vad-path-prefix', - dest='vad_path_prefix', - default=None, - help=('scp file_path prefix for vad')) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) - parser.add_argument('--model-path', required=True) + parser.add_argument("--model-path", required=True) parser.add_argument( - '--embed-layer', + "--embed-layer", type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) - parser.add_argument('--use-gpu', - default=False, - action='store_true', - help='extract xvectors in gpu') + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) - AttackFactory.add_class_args(parser, prefix='attack') + AttackFactory.add_class_args(parser, prefix="attack") - parser.add_argument('--seg-part-idx', - default=1, - type=int, - help=('test part index')) + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) parser.add_argument( - '--num-seg-parts', + "--num-seg-parts", default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', - '--verbose', - dest='verbose', - default=1, - choices=[0, 1, 2, 3], - type=int) - - 
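The rt-factor logged per test utterance is wall-clock time divided by the total audio duration processed across trials, (t7 - t1) / (num_trials * num_samples / fs). A short arithmetic example with illustrative numbers:

# Illustrative arithmetic for the logged rt-factor: a 3 s test utterance at 16 kHz,
# scored against 10 enrollment models in 6 s of wall-clock time.
fs = 16000
num_samples = 3 * fs          # 48000 samples
num_trials = 10
elapsed = 6.0                 # hypothetical total time (t7 - t1)
rt_factor = elapsed / (num_trials * num_samples / fs)   # 6 / 30 = 0.2
print(rt_factor)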
parser.add_argument('--save-adv-wav', - default=False, - action='store_true', - help='save adversarial signals to disk') - parser.add_argument('--save-adv-wav-path', - default=None, - help='output path of adv signals') + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) # parser.add_argument('--save-adv-wav-tar-thr', # default=0.75, type=float, @@ -360,26 +403,24 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, # default=-0.75, type=float, # help='max score to save signal from attack that makes tar into non-tar') - parser.add_argument('--stats-file', - default=None, - help='output path of to save stats of adv signals') - - parser.add_argument('--cal-file', - default=None, - help='score calibration file') - parser.add_argument('--threshold', - default=0, - type=float, - help='decision threshold') - parser.add_argument('--smooth-sigma', - default=0, - type=float, - help='sigma for smoothing') - parser.add_argument('--max-test-length', - default=None, - type=float, - help=('maximum length (secs) for the test side, ' - 'this is to avoid GPU memory errors')) + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--smooth-sigma", default=0, type=float, help="sigma for smoothing" + ) + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py index 88e65684..44a3b98f 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-art-test-wav.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -32,37 +37,39 @@ from hyperion.torch import TorchModelLoader as TML from art.classifiers import PyTorchClassifier -from hyperion.torch.adv_attacks.art_attack_factory import ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_feats(**kwargs): - feat_args = AF.filter_args(**kwargs['feats']) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + 
feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) feat_extractor.eval() return feat_extractor def load_model(model_path): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.eval() return model def load_calibrator(cal_file): - logging.info('loading calibration params {}'.format(cal_file)) + logging.info("loading calibration params {}".format(cal_file)) lr = LR.load(cal_file) calibrator = Calibrator(lr.A[0, 0], lr.b[0]) calibrator.eval() @@ -85,12 +92,14 @@ def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): class MyModel(nn.Module): - def __init__(self, - feat_extractor, - xvector_model, - embed_layer=None, - calibrator=None, - threshold=0): + def __init__( + self, + feat_extractor, + xvector_model, + embed_layer=None, + calibrator=None, + threshold=0, + ): super().__init__() self.feat_extractor = feat_extractor self.xvector_model = xvector_model @@ -114,8 +123,7 @@ def forward(self, s_t): f_t = f_t[:, self.vad_t] f_t = f_t.transpose(1, 2).contiguous() - x_t = self.xvector_model.extract_embed(f_t, - embed_layer=self.embed_layer) + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) x_t = l2_norm(x_t) x_e = l2_norm(self.x_e) tar_score = torch.sum(x_e * x_t, dim=-1, keepdim=True) @@ -123,17 +131,33 @@ def forward(self, s_t): score = self.calibrator(tar_score) non_score = self.threshold + 0 * tar_score - score = torch.cat((non_score, tar_score), dim=-1) #.unsqueeze(0) + score = torch.cat((non_score, tar_score), dim=-1) # .unsqueeze(0) return score -def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, - vad_path_prefix, model_path, embed_layer, score_file, - stats_file, cal_file, threshold, save_adv_wav, - save_adv_wav_path, max_test_length, use_gpu, - seg_part_idx, num_seg_parts, **kwargs): - - device_type = 'gpu' if use_gpu else 'cpu' +def eval_cosine_scoring( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + score_file, + stats_file, + cal_file, + threshold, + save_adv_wav, + save_adv_wav_path, + max_test_length, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): + + device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) feat_extractor = init_feats(**kwargs) xvector_model = load_model(model_path) @@ -142,20 +166,17 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, if cal_file is not None: calibrator = load_calibrator(cal_file) - model = MyModel(feat_extractor, - xvector_model, - embed_layer, - calibrator, - threshold=threshold) + model = MyModel( + feat_extractor, xvector_model, embed_layer, calibrator, threshold=threshold + ) model.to(device) model.eval() tar = np.asarray([1], dtype=np.int) non = np.asarray([0], dtype=np.int) - logging.info('loading key and enrollment x-vectors') - key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, - num_seg_parts) + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) 
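In the ART variant, MyModel.forward returns a two-column score so that the verification trial looks like a binary classifier to ART's PyTorchClassifier: column 0 is a constant non-target score pinned at the threshold, column 1 is the target score, so an attack that flips the argmax flips the accept/reject decision. A minimal standalone illustration of that construction (shapes and the toy score are illustrative):

import torch

# Standalone illustration of the 2-class wrapping used for ART attacks:
# column 0 = constant "non-target" score at the threshold, column 1 = target score.
def two_class_scores(tar_score: torch.Tensor, threshold: float = 0.0) -> torch.Tensor:
    non_score = threshold + 0 * tar_score             # same shape/device as tar_score
    return torch.cat((non_score, tar_score), dim=-1)  # (batch, 2) scores for ART

scores = two_class_scores(torch.tensor([[0.7]]))
print(scores.argmax(dim=-1))   # tensor([1]): accept, since 0.7 > threshold 0.0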
audio_args = AR.filter_args(**kwargs) @@ -163,29 +184,38 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, wav_scale = audio_reader.wav_scale if save_adv_wav: - tar_audio_writer = AW(save_adv_wav_path + '/tar2non') - non_audio_writer = AW(save_adv_wav_path + '/non2tar') + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") - attack_args = AttackFactory.filter_args(**kwargs['attack']) - extra_args = {'eps_scale': wav_scale} + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = {"eps_scale": wav_scale} attack_args.update(extra_args) - logging.info('attack-args={}'.format(attack_args)) + logging.info("attack-args={}".format(attack_args)) if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) - v_reader = VRF.create(vad_spec, - path_prefix=vad_path_prefix, - scp_sep=' ') - - scores = np.zeros((key.num_models, key.num_tests), dtype='float32') - attack_stats = pd.DataFrame(columns=[ - 'modelid', 'segmentid', 'snr', 'px', 'pn', 'x_l2', 'x_linf', 'n_l0', - 'n_l2', 'n_linf', 'num_frames' - ]) + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) for j in range(key.num_tests): t1 = time.time() - logging.info('scoring test utt %s' % (key.seg_set[j])) + logging.info("scoring test utt %s" % (key.seg_set[j])) s, fs = audio_reader.read([key.seg_set[j]]) s = s[0] fs = fs[0] @@ -195,34 +225,42 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, if len(s) > max_samples: s = s[:max_samples] - s = s[None, :].astype('float32', copy=False) - s_tensor = torch.as_tensor(s, - dtype=torch.get_default_dtype()).to(device) + s = s[None, :].astype("float32", copy=False) + s_tensor = torch.as_tensor(s, dtype=torch.get_default_dtype()).to(device) if vad_spec is not None: vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), - dtype=torch.bool).to(device) + vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + device + ) model.vad_t = vad - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % - (key.seg_set[j], speech_frames, tot_frames, - speech_frames / tot_frames * 100)) + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) t2 = time.time() trial_time = 0 num_trials = 0 - model_art = PyTorchClassifier(model=model, - loss=nn.CrossEntropyLoss(), - optimizer=None, - input_shape=[1, s.shape[1]], - nb_classes=2, - clip_values=(-wav_scale, wav_scale), - device_type=device_type) - - attack_args['num_samples'] = s.shape[-1] + model_art = PyTorchClassifier( + model=model, + loss=nn.CrossEntropyLoss(), + optimizer=None, + input_shape=[1, s.shape[1]], + nb_classes=2, + clip_values=(-wav_scale, wav_scale), + device_type=device_type, + ) + + attack_args["num_samples"] = s.shape[-1] attack = AttackFactory.create(model_art, **attack_args) for i in range(key.num_models): if key.tar[i, j] or key.non[i, j]: @@ -245,34 +283,33 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, scores[i, j] 
= model(s_adv).cpu().numpy()[0, 1] t4 = time.time() - trial_time += (t4 - t3) + trial_time += t4 - t3 num_trials += 1 s_adv = s_adv.detach() stats_ij = compute_stats_adv_attack(s_tensor, s_adv) - stats_ij = [ - stat.detach().cpu().numpy()[0] for stat in stats_ij - ] + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] attack_stats = attack_stats.append( { - 'modelid': key.model_set[i], - 'segmentid': key.seg_set[j], - 'snr': stats_ij[0], - 'px': stats_ij[1], - 'pn': stats_ij[2], - 'x_l2': stats_ij[3], - 'x_linf': stats_ij[4], - 'n_l0': stats_ij[5], - 'n_l2': stats_ij[6], - 'n_linf': stats_ij[7], - 'num_samples': s.shape[-1] + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], }, - ignore_index=True) + ignore_index=True, + ) - #logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) if save_adv_wav: s_adv = s_adv.cpu().numpy()[0] - trial_name = '%s-%s' % (key.model_set[i], key.seg_set[j]) + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) if key.tar[i, j] and scores[i, j] < threshold: tar_audio_writer.write(trial_name, s_adv, fs) elif key.non[i, j] and scores[i, j] > threshold: @@ -282,104 +319,116 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, del model_art trial_time /= num_trials t7 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d ' - 'rt-factor=%.5f'), key.seg_set[j], t7 - t1, t2 - t1, trial_time, - num_trials, (t7 - t1) / (num_trials * s.shape[1] / fs)) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.5f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) if num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, 1, seg_part_idx) - stats_file = '%s-%03d-%03d' % (stats_file, 1, seg_part_idx) - logging.info('saving scores to %s' % (score_file)) - s = TrialScores(key.model_set, - key.seg_set, - scores, - score_mask=np.logical_or(key.tar, key.non)) + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) s.save_txt(score_file) - logging.info('saving stats to %s' % (stats_file)) + logging.info("saving stats to %s" % (stats_file)) attack_stats.to_csv(stats_file) if __name__ == "__main__": parser = ArgumentParser( - description=('Eval cosine-scoring given enroll x-vector ' - 'and adversarial test wave from ART')) - - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--v-file', dest='v_file', required=True) - parser.add_argument('--key-file', dest='key_file', default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-wav-file', required=True) + description=( + "Eval cosine-scoring given enroll x-vector " + "and adversarial test wave from ART" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", dest="v_file", required=True) 
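These evaluation scripts are driven by jsonargparse, so the "--cfg" option registered just above can load most of the remaining flags from a YAML config file instead of the command line. A minimal sketch of that pattern follows; the file paths and option values are illustrative only and are not part of this patch.

from jsonargparse import ArgumentParser, ActionConfigFile

parser = ArgumentParser()
parser.add_argument("--cfg", action=ActionConfigFile)
parser.add_argument("--v-file", required=True)
parser.add_argument("--score-file", required=True)

# Equivalent to putting "v_file: ..." and "score_file: ..." in a YAML file
# and invoking the script with "--cfg eval.yaml".
args = parser.parse_args(
    ["--v-file", "exp/xvectors/enroll.h5", "--score-file", "exp/scores/scores.txt"]
)
print(args.v_file)  # exp/xvectors/enroll.h5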
+ parser.add_argument("--key-file", dest="key_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--vad-path-prefix', - dest='vad_path_prefix', - default=None, - help=('scp file_path prefix for vad')) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) - parser.add_argument('--model-path', required=True) + parser.add_argument("--model-path", required=True) parser.add_argument( - '--embed-layer', + "--embed-layer", type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) - parser.add_argument('--use-gpu', - default=False, - action='store_true', - help='extract xvectors in gpu') + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) - AttackFactory.add_class_args(parser, prefix='attack') + AttackFactory.add_class_args(parser, prefix="attack") - parser.add_argument('--seg-part-idx', - default=1, - type=int, - help=('test part index')) + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) parser.add_argument( - '--num-seg-parts', + "--num-seg-parts", default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', - '--verbose', - dest='verbose', - default=1, - choices=[0, 1, 2, 3], - type=int) - - parser.add_argument('--save-adv-wav', - default=False, - action='store_true', - help='save adversarial signals to disk') - parser.add_argument('--save-adv-wav-path', - default=None, - help='output path of adv signals') - - parser.add_argument('--stats-file', - default=None, - help='output path of to save stats of adv signals') - - parser.add_argument('--cal-file', - default=None, - help='score calibration file') - parser.add_argument('--threshold', - default=0, - type=float, - help='decision threshold') - parser.add_argument('--max-test-length', - default=None, - type=float, - help=('maximum length (secs) for the test side, ' - 'this is to avoid GPU memory errors')) + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + 
"maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py index eaccf71f..c7bcc50a 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-test-wav.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -31,35 +36,35 @@ def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_feats(device, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) feat_extractor.eval() feat_extractor.to(device) return feat_extractor def load_model(model_path, device): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.to(device) model.eval() return model def load_calibrator(cal_file, device): - logging.info('loading calibration params {}'.format(cal_file)) + logging.info("loading calibration params {}".format(cal_file)) lr = LR.load(cal_file) calibrator = Calibrator(lr.A[0, 0], lr.b[0]) calibrator.to(device) @@ -89,10 +94,23 @@ def read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts): return ndx, x_e -def eval_cosine_scoring(v_file, ndx_file, enroll_file, test_wav_file, vad_spec, - vad_path_prefix, model_path, embed_layer, score_file, - cal_file, max_test_length, use_gpu, seg_part_idx, - num_seg_parts, **kwargs): +def eval_cosine_scoring( + v_file, + ndx_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + score_file, + cal_file, + max_test_length, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) @@ -102,24 +120,21 @@ def eval_cosine_scoring(v_file, ndx_file, enroll_file, test_wav_file, vad_spec, if cal_file is not None: calibrator = load_calibrator(cal_file, device) - logging.info('loading ndx and enrollment x-vectors') - ndx, y_e = read_data(v_file, ndx_file, enroll_file, seg_part_idx, - num_seg_parts) + logging.info("loading ndx and enrollment x-vectors") + ndx, y_e = read_data(v_file, ndx_file, enroll_file, seg_part_idx, num_seg_parts) audio_args = AR.filter_args(**kwargs) audio_reader = AR(test_wav_file, **audio_args) if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) - v_reader = VRF.create(vad_spec, - path_prefix=vad_path_prefix, - scp_sep=' ') + logging.info("opening VAD stream: %s" % 
(vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") - scores = np.zeros((ndx.num_models, ndx.num_tests), dtype='float32') + scores = np.zeros((ndx.num_models, ndx.num_tests), dtype="float32") with torch.no_grad(): for j in range(ndx.num_tests): t1 = time.time() - logging.info('scoring test utt %s' % (ndx.seg_set[j])) + logging.info("scoring test utt %s" % (ndx.seg_set[j])) s, fs = audio_reader.read([ndx.seg_set[j]]) s = s[0] fs = fs[0] @@ -130,20 +145,27 @@ def eval_cosine_scoring(v_file, ndx_file, enroll_file, test_wav_file, vad_spec, s = s[:max_samples] t2 = time.time() - s = torch.as_tensor(s[None, :], - dtype=torch.get_default_dtype()).to(device) + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) x_t = feat_extractor(s) t4 = time.time() tot_frames = x_t.shape[1] if vad_spec is not None: - vad = torch.as_tensor(v_reader.read( - [ndx.seg_set[j]], - num_frames=x_t.shape[1])[0].astype(np.uint8, copy=False), - dtype=torch.uint8).to(device) + vad = torch.as_tensor( + v_reader.read([ndx.seg_set[j]], num_frames=x_t.shape[1])[0].astype( + np.uint8, copy=False + ), + dtype=torch.uint8, + ).to(device) x_t = x_t[:, vad] - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % - (ndx.seg_set[j], x_t.shape[1], tot_frames, - x_t.shape[1] / tot_frames * 100)) + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + ndx.seg_set[j], + x_t.shape[1], + tot_frames, + x_t.shape[1] / tot_frames * 100, + ) + ) t5 = time.time() x_t = x_t.transpose(1, 2).contiguous() @@ -153,8 +175,9 @@ def eval_cosine_scoring(v_file, ndx_file, enroll_file, test_wav_file, vad_spec, for i in range(ndx.num_models): if ndx.trial_mask[i, j]: - y_e_i = torch.as_tensor( - y_e[i], dtype=torch.get_default_dtype()).to(device) + y_e_i = torch.as_tensor(y_e[i], dtype=torch.get_default_dtype()).to( + device + ) y_e_i = l2_norm(y_e_i) scores_ij = torch.sum(y_e_i * y_t, dim=-1) if calibrator is None: @@ -166,79 +189,92 @@ def eval_cosine_scoring(v_file, ndx_file, enroll_file, test_wav_file, vad_spec, num_trials = np.sum(ndx.trial_mask[:, j]) trial_time = (t7 - t6) / num_trials logging.info( - ('utt %s total-time=%.3f read-time=%.3f feat-time=%.3f ' - 'vad-time=%.3f embed-time=%.3f trial-time=%.3f n_trials=%d ' - 'rt-factor=%.2f'), ndx.seg_set[j], t7 - t1, t2 - t1, t4 - t2, - t5 - t4, t6 - t5, trial_time, num_trials, - (t7 - t1) / (num_trials * s.shape[1] / fs)) + ( + "utt %s total-time=%.3f read-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.2f" + ), + ndx.seg_set[j], + t7 - t1, + t2 - t1, + t4 - t2, + t5 - t4, + t6 - t5, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) if num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, 1, seg_part_idx) - logging.info('saving scores to %s', score_file) - s = TrialScores(ndx.model_set, - ndx.seg_set, - scores, - score_mask=ndx.trial_mask) + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores(ndx.model_set, ndx.seg_set, scores, score_mask=ndx.trial_mask) s.save_txt(score_file) if __name__ == "__main__": parser = ArgumentParser( - description='Eval cosine-scoring given enroll x-vector and test wave') + description="Eval cosine-scoring given enroll x-vector and test wave" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--v-file', dest='v_file', required=True) - parser.add_argument('--ndx-file', dest='ndx_file', 
default=None) - parser.add_argument('--enroll-file', dest='enroll_file', required=True) - parser.add_argument('--test-wav-file', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", dest="v_file", required=True) + parser.add_argument("--ndx-file", dest="ndx_file", default=None) + parser.add_argument("--enroll-file", dest="enroll_file", required=True) + parser.add_argument("--test-wav-file", required=True) AR.add_class_args(parser) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--vad-path-prefix', - dest='vad_path_prefix', - default=None, - help=('scp file_path prefix for vad')) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) - parser.add_argument('--model-path', required=True) + parser.add_argument("--model-path", required=True) parser.add_argument( - '--embed-layer', + "--embed-layer", type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) - - parser.add_argument('--use-gpu', - default=False, - action='store_true', - help='extract xvectors in gpu') - - parser.add_argument('--seg-part-idx', - default=1, - type=int, - help=('test part index')) + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) parser.add_argument( - '--num-seg-parts', + "--num-seg-parts", default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - - parser.add_argument('--score-file', required=True) - parser.add_argument('--cal-file', default=None) - parser.add_argument('-v', - '--verbose', - dest='verbose', - default=1, - choices=[0, 1, 2, 3], - type=int) - parser.add_argument('--max-test-length', - default=None, - type=float, - help=('maximum length (secs) for the test side, ' - 'this is to avoid GPU memory errors')) + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", required=True) + parser.add_argument("--cal-file", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py index f08c0d75..4b08c7ab 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-adv-test-wav.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -34,11 +39,9 @@ class 
MyModel(nn.Module): - def __init__(self, - feat_extractor, - xvector_model, - embed_layer=None, - calibrator=None): + def __init__( + self, feat_extractor, xvector_model, embed_layer=None, calibrator=None + ): super().__init__() self.feat_extractor = feat_extractor self.xvector_model = xvector_model @@ -61,8 +64,7 @@ def forward(self, s_t): f_t = f_t[:, self.vad_t] f_t = f_t.transpose(1, 2).contiguous() - x_t = self.xvector_model.extract_embed(f_t, - embed_layer=self.embed_layer) + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) x_t = l2_norm(x_t) x_e = l2_norm(self.x_e) score = torch.sum(x_e * x_t, dim=-1) @@ -73,37 +75,37 @@ def forward(self, s_t): def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_feats(**kwargs): feat_args = AF.filter_args(**kwargs) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) feat_extractor.eval() return feat_extractor def load_model(model_path): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.freeze() model.eval() return model def load_calibrator(cal_file, threshold): - logging.info('loading calibration params {}'.format(cal_file)) + logging.info("loading calibration params {}".format(cal_file)) lr = LR.load(cal_file) - #subting the threshold here will put the decision threshold in 0 - #some attacks use thr=0 to decide if the attack is succesful + # subting the threshold here will put the decision threshold in 0 + # some attacks use thr=0 to decide if the attack is succesful calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold) calibrator.eval() return calibrator @@ -128,16 +130,34 @@ def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): return key, x_e -def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, - vad_path_prefix, transfer_v_file, model_path, - transfer_model_path, embed_layer, score_file, - stats_file, cal_file, transfer_cal_file, threshold, - max_test_length, save_adv_wav, save_adv_wav_path, - use_gpu, seg_part_idx, num_seg_parts, **kwargs): +def eval_cosine_scoring( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + transfer_v_file, + model_path, + transfer_model_path, + embed_layer, + score_file, + stats_file, + cal_file, + transfer_cal_file, + threshold, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): device = init_device(use_gpu) # load victim model - feat_extractor = init_feats(**kwargs['feats']) + feat_extractor = init_feats(**kwargs["feats"]) xvector_model = load_model(model_path) calibrator = None if cal_file is not None: @@ -148,7 +168,7 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, model.eval() # load white-box model - tfeat_extractor = init_feats(**kwargs['transfer_feats']) + 
tfeat_extractor = init_feats(**kwargs["transfer_feats"]) xvector_tmodel = load_model(transfer_model_path) tcalibrator = None if transfer_cal_file is not None: @@ -161,13 +181,13 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, tar = torch.as_tensor([1], dtype=torch.float).to(device) non = torch.as_tensor([0], dtype=torch.float).to(device) - logging.info('loading key and enrollment x-vectors') - key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, - num_seg_parts) + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) - _, t_x_e = read_data(transfer_v_file, key_file, enroll_file, seg_part_idx, - num_seg_parts) + _, t_x_e = read_data( + transfer_v_file, key_file, enroll_file, seg_part_idx, num_seg_parts + ) t_x_e = torch.as_tensor(t_x_e, dtype=torch.get_default_dtype()) audio_args = AR.filter_args(**kwargs) @@ -175,35 +195,44 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, wav_scale = audio_reader.wav_scale if save_adv_wav: - tar_audio_writer = AW(save_adv_wav_path + '/tar2non') - non_audio_writer = AW(save_adv_wav_path + '/non2tar') + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") - attack_args = AttackFactory.filter_args(**kwargs['attack']) + attack_args = AttackFactory.filter_args(**kwargs["attack"]) extra_args = { - 'eps_scale': wav_scale, - 'range_min': -wav_scale, - 'range_max': wav_scale, - 'loss': nn.functional.binary_cross_entropy_with_logits, - 'time_dim': 1 + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, } attack_args.update(extra_args) - logging.info('attacks args={}'.format(attack_args)) + logging.info("attacks args={}".format(attack_args)) attack = AttackFactory.create(model, **attack_args) if vad_spec is not None: - logging.info('opening VAD stream: %s', vad_spec) - v_reader = VRF.create(vad_spec, - path_prefix=vad_path_prefix, - scp_sep=' ') - - scores = np.zeros((key.num_models, key.num_tests), dtype='float32') - attack_stats = pd.DataFrame(columns=[ - 'modelid', 'segmentid', 'snr', 'px', 'pn', 'x_l2', 'x_linf', 'n_l0', - 'n_l2', 'n_linf', 'num_frames' - ]) + logging.info("opening VAD stream: %s", vad_spec) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) for j in range(key.num_tests): t1 = time.time() - logging.info('scoring test utt %s', key.seg_set[j]) + logging.info("scoring test utt %s", key.seg_set[j]) s, fs = audio_reader.read([key.seg_set[j]]) s = s[0] fs = fs[0] @@ -213,19 +242,23 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, if len(s) > max_samples: s = s[:max_samples] - s = torch.as_tensor(s[None, :], - dtype=torch.get_default_dtype()).to(device) + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) if vad_spec is not None: vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), - dtype=torch.bool).to(device) + vad = torch.as_tensor(vad.astype(np.bool, 
copy=False), dtype=torch.bool).to( + device + ) model.vad_t = vad - logging.info('utt %s detected %d/%d (%.2f %%) speech frames', - key.seg_set[j], speech_frames, tot_frames, - speech_frames / tot_frames * 100) + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames", + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) t2 = time.time() @@ -252,34 +285,33 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, scores[i, j] = model(s_adv) t4 = time.time() - trial_time += (t4 - t3) + trial_time += t4 - t3 num_trials += 1 s_adv = s_adv.detach() stats_ij = compute_stats_adv_attack(s, s_adv) - stats_ij = [ - stat.detach().cpu().numpy()[0] for stat in stats_ij - ] + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] attack_stats = attack_stats.append( { - 'modelid': key.model_set[i], - 'segmentid': key.seg_set[j], - 'snr': stats_ij[0], - 'px': stats_ij[1], - 'pn': stats_ij[2], - 'x_l2': stats_ij[3], - 'x_linf': stats_ij[4], - 'n_l0': stats_ij[5], - 'n_l2': stats_ij[6], - 'n_linf': stats_ij[7], - 'num_samples': s.shape[-1] + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], }, - ignore_index=True) + ignore_index=True, + ) - #logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) if save_adv_wav: s_adv = s_adv.cpu().numpy()[0] - trial_name = '%s-%s' % (key.model_set[i], key.seg_set[j]) + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) if key.tar[i, j] and scores[i, j] < threshold: tar_audio_writer.write(trial_name, s_adv, fs) elif key.non[i, j] and scores[i, j] > threshold: @@ -287,110 +319,124 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, trial_time /= num_trials t7 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d ' - 'rt-factor=%.2f'), key.seg_set[j], t7 - t1, t2 - t1, trial_time, - num_trials, (t7 - t1) / (num_trials * s.shape[1] / fs)) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.2f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) if num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, 1, seg_part_idx) - stats_file = '%s-%03d-%03d' % (stats_file, 1, seg_part_idx) - logging.info('saving scores to %s', score_file) - s = TrialScores(key.model_set, - key.seg_set, - scores, - score_mask=np.logical_or(key.tar, key.non)) + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s", score_file) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) s.save_txt(score_file) - logging.info('saving stats to %s', stats_file) + logging.info("saving stats to %s", stats_file) attack_stats.to_csv(stats_file) if __name__ == "__main__": parser = ArgumentParser( - description=('Eval cosine-scoring given enroll x-vector and ' - 'adversarial test wave obtained from a different model')) + description=( + "Eval cosine-scoring given enroll x-vector and " + "adversarial 
test wave obtained from a different model" + ) + ) - parser.add_argument('--v-file', required=True) - parser.add_argument('--key-file', default=None) - parser.add_argument('--enroll-file', required=True) - parser.add_argument('--test-wav-file', required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) - parser.add_argument('--transfer-v-file', required=True) + parser.add_argument("--transfer-v-file", required=True) AR.add_class_args(parser) - AF.add_class_args(parser, prefix='feats') - AF.add_class_args(parser, prefix='transfer_feats') + AF.add_class_args(parser, prefix="feats") + AF.add_class_args(parser, prefix="transfer_feats") - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--vad-path-prefix', - dest='vad_path_prefix', - default=None, - help=('scp file_path prefix for vad')) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) - parser.add_argument('--model-path', required=True) - parser.add_argument('--transfer-model-path', required=True) + parser.add_argument("--model-path", required=True) + parser.add_argument("--transfer-model-path", required=True) parser.add_argument( - '--embed-layer', + "--embed-layer", type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) - parser.add_argument('--use-gpu', - default=False, - action='store_true', - help='extract xvectors in gpu') + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) - AttackFactory.add_class_args(parser, prefix='attack') + AttackFactory.add_class_args(parser, prefix="attack") - parser.add_argument('--seg-part-idx', - default=1, - type=int, - help=('test part index')) + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) parser.add_argument( - '--num-seg-parts', + "--num-seg-parts", default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', - '--verbose', - dest='verbose', - default=1, - choices=[0, 1, 2, 3], - type=int) - - parser.add_argument('--save-adv-wav', - default=False, - action='store_true', - help='save adversarial signals to disk') - - parser.add_argument('--save-adv-wav-path', - default=None, - help='output path of adv signals') - - parser.add_argument('--stats-file', - default=None, - help='output path of to save stats of adv signals') - parser.add_argument('--cal-file', - default=None, - help='score calibration file') - parser.add_argument('--transfer-cal-file', - default=None, - help='score calibration file for transfer model') - parser.add_argument('--threshold', - default=0, - type=float, - help='decision threshold') - parser.add_argument('--max-test-length', - default=None, - type=float, - help=('maximum length (secs) for the test side, ' - 'this is to avoid GPU memory errors')) + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", 
dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument( + "--transfer-cal-file", + default=None, + help="score calibration file for transfer model", + ) + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py index 4e0b237e..9d9d4666 100755 --- a/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py +++ b/hyperion/bin/torch-eval-xvec-cosine-scoring-from-transfer-art-test-wav.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -32,16 +37,20 @@ from hyperion.torch import TorchModelLoader as TML from art.classifiers import PyTorchClassifier -from hyperion.torch.adv_attacks.art_attack_factory import ARTAttackFactory as AttackFactory +from hyperion.torch.adv_attacks.art_attack_factory import ( + ARTAttackFactory as AttackFactory, +) class MyModel(nn.Module): - def __init__(self, - feat_extractor, - xvector_model, - embed_layer=None, - calibrator=None, - threshold=0): + def __init__( + self, + feat_extractor, + xvector_model, + embed_layer=None, + calibrator=None, + threshold=0, + ): super().__init__() self.feat_extractor = feat_extractor self.xvector_model = xvector_model @@ -65,8 +74,7 @@ def forward(self, s_t): f_t = f_t[:, self.vad_t] f_t = f_t.transpose(1, 2).contiguous() - x_t = self.xvector_model.extract_embed(f_t, - embed_layer=self.embed_layer) + x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) x_t = l2_norm(x_t) x_e = l2_norm(self.x_e) tar_score = torch.sum(x_e * x_t, dim=-1, keepdim=True) @@ -74,39 +82,39 @@ def forward(self, s_t): score = self.calibrator(tar_score) non_score = self.threshold + 0 * tar_score - score = torch.cat((non_score, tar_score), dim=-1) #.unsqueeze(0) + score = torch.cat((non_score, tar_score), dim=-1) # .unsqueeze(0) return score def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_feats(**kwargs): feat_args = AF.filter_args(**kwargs) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - 
logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) feat_extractor.eval() return feat_extractor def load_model(model_path): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.freeze() model.eval() return model def load_calibrator(cal_file): - logging.info('loading calibration params {}'.format(cal_file)) + logging.info("loading calibration params {}".format(cal_file)) lr = LR.load(cal_file) calibrator = Calibrator(lr.A[0, 0], lr.b[0]) calibrator.eval() @@ -132,55 +140,69 @@ def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): return key, x_e -def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, - vad_path_prefix, transfer_v_file, model_path, - transfer_model_path, embed_layer, score_file, - stats_file, cal_file, transfer_cal_file, threshold, - max_test_length, save_adv_wav, save_adv_wav_path, - use_gpu, seg_part_idx, num_seg_parts, **kwargs): - - device_type = 'gpu' if use_gpu else 'cpu' +def eval_cosine_scoring( + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + transfer_v_file, + model_path, + transfer_model_path, + embed_layer, + score_file, + stats_file, + cal_file, + transfer_cal_file, + threshold, + max_test_length, + save_adv_wav, + save_adv_wav_path, + use_gpu, + seg_part_idx, + num_seg_parts, + **kwargs +): + + device_type = "gpu" if use_gpu else "cpu" device = init_device(use_gpu) # load victim model - feat_extractor = init_feats(**kwargs['feats']) + feat_extractor = init_feats(**kwargs["feats"]) xvector_model = load_model(model_path) calibrator = None if cal_file is not None: calibrator = load_calibrator(cal_file) - model = MyModel(feat_extractor, - xvector_model, - embed_layer, - calibrator, - threshold=threshold) + model = MyModel( + feat_extractor, xvector_model, embed_layer, calibrator, threshold=threshold + ) model.to(device) model.eval() # load white-box model - tfeat_extractor = init_feats(**kwargs['transfer_feats']) + tfeat_extractor = init_feats(**kwargs["transfer_feats"]) xvector_tmodel = load_model(transfer_model_path) tcalibrator = None if transfer_cal_file is not None: tcalibrator = load_calibrator(transfer_cal_file) - tmodel = MyModel(tfeat_extractor, - xvector_tmodel, - embed_layer, - tcalibrator, - threshold=threshold) + tmodel = MyModel( + tfeat_extractor, xvector_tmodel, embed_layer, tcalibrator, threshold=threshold + ) tmodel.to(device) tmodel.eval() tar = np.asarray([1], dtype=np.int) non = np.asarray([0], dtype=np.int) - logging.info('loading key and enrollment x-vectors') - key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, - num_seg_parts) + logging.info("loading key and enrollment x-vectors") + key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) - _, t_x_e = read_data(transfer_v_file, key_file, enroll_file, seg_part_idx, - num_seg_parts) + _, t_x_e = read_data( + transfer_v_file, key_file, enroll_file, seg_part_idx, num_seg_parts + ) t_x_e = torch.as_tensor(t_x_e, dtype=torch.get_default_dtype()) audio_args = AR.filter_args(**kwargs) @@ -188,28 +210,37 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, wav_scale = audio_reader.wav_scale if save_adv_wav: - tar_audio_writer = 
AW(save_adv_wav_path + '/tar2non') - non_audio_writer = AW(save_adv_wav_path + '/non2tar') + tar_audio_writer = AW(save_adv_wav_path + "/tar2non") + non_audio_writer = AW(save_adv_wav_path + "/non2tar") - attack_args = AttackFactory.filter_args(**kwargs['attack']) - extra_args = {'eps_scale': wav_scale} + attack_args = AttackFactory.filter_args(**kwargs["attack"]) + extra_args = {"eps_scale": wav_scale} attack_args.update(extra_args) - logging.info('attack-args={}'.format(attack_args)) + logging.info("attack-args={}".format(attack_args)) if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) - v_reader = VRF.create(vad_spec, - path_prefix=vad_path_prefix, - scp_sep=' ') - - scores = np.zeros((key.num_models, key.num_tests), dtype='float32') - attack_stats = pd.DataFrame(columns=[ - 'modelid', 'segmentid', 'snr', 'px', 'pn', 'x_l2', 'x_linf', 'n_l0', - 'n_l2', 'n_linf', 'num_frames' - ]) + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") + + scores = np.zeros((key.num_models, key.num_tests), dtype="float32") + attack_stats = pd.DataFrame( + columns=[ + "modelid", + "segmentid", + "snr", + "px", + "pn", + "x_l2", + "x_linf", + "n_l0", + "n_l2", + "n_linf", + "num_frames", + ] + ) for j in range(key.num_tests): t1 = time.time() - logging.info('scoring test utt %s' % (key.seg_set[j])) + logging.info("scoring test utt %s" % (key.seg_set[j])) s, fs = audio_reader.read([key.seg_set[j]]) s = s[0] fs = fs[0] @@ -219,35 +250,43 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, if len(s) > max_samples: s = s[:max_samples] - s = s[None, :].astype('float32', copy=False) - s_tensor = torch.as_tensor(s, - dtype=torch.get_default_dtype()).to(device) + s = s[None, :].astype("float32", copy=False) + s_tensor = torch.as_tensor(s, dtype=torch.get_default_dtype()).to(device) if vad_spec is not None: vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), - dtype=torch.bool).to(device) + vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + device + ) model.vad_t = vad tmodel.vad_t = vad - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % - (key.seg_set[j], speech_frames, tot_frames, - speech_frames / tot_frames * 100)) + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) t2 = time.time() trial_time = 0 num_trials = 0 - model_art = PyTorchClassifier(model=tmodel, - loss=nn.CrossEntropyLoss(), - optimizer=None, - input_shape=[1, s.shape[1]], - nb_classes=2, - clip_values=(-wav_scale, wav_scale), - device_type=device_type) - - attack_args['num_samples'] = s.shape[-1] + model_art = PyTorchClassifier( + model=tmodel, + loss=nn.CrossEntropyLoss(), + optimizer=None, + input_shape=[1, s.shape[1]], + nb_classes=2, + clip_values=(-wav_scale, wav_scale), + device_type=device_type, + ) + + attack_args["num_samples"] = s.shape[-1] attack = AttackFactory.create(model_art, **attack_args) for i in range(key.num_models): if key.tar[i, j] or key.non[i, j]: @@ -271,34 +310,33 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, scores[i, j] = model(s_adv).cpu().numpy()[0, 1] t4 = time.time() - trial_time += (t4 - t3) + trial_time += t4 - t3 num_trials += 1 s_adv = s_adv.detach() stats_ij = compute_stats_adv_attack(s_tensor, 
s_adv) - stats_ij = [ - stat.detach().cpu().numpy()[0] for stat in stats_ij - ] + stats_ij = [stat.detach().cpu().numpy()[0] for stat in stats_ij] attack_stats = attack_stats.append( { - 'modelid': key.model_set[i], - 'segmentid': key.seg_set[j], - 'snr': stats_ij[0], - 'px': stats_ij[1], - 'pn': stats_ij[2], - 'x_l2': stats_ij[3], - 'x_linf': stats_ij[4], - 'n_l0': stats_ij[5], - 'n_l2': stats_ij[6], - 'n_linf': stats_ij[7], - 'num_samples': s.shape[-1] + "modelid": key.model_set[i], + "segmentid": key.seg_set[j], + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], }, - ignore_index=True) + ignore_index=True, + ) - #logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) + # logging.info('min-max %f %f %f %f' % (torch.min(s), torch.max(s), torch.min(s_adv-s), torch.max(s_adv-s))) if save_adv_wav: s_adv = s_adv.cpu().numpy()[0] - trial_name = '%s-%s' % (key.model_set[i], key.seg_set[j]) + trial_name = "%s-%s" % (key.model_set[i], key.seg_set[j]) if key.tar[i, j] and scores[i, j] < threshold: tar_audio_writer.write(trial_name, s_adv, fs) elif key.non[i, j] and scores[i, j] > threshold: @@ -308,109 +346,123 @@ def eval_cosine_scoring(v_file, key_file, enroll_file, test_wav_file, vad_spec, del model_art trial_time /= num_trials t7 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d ' - 'rt-factor=%.2f'), key.seg_set[j], t7 - t1, t2 - t1, trial_time, - num_trials, (t7 - t1) / (num_trials * s.shape[1] / fs)) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.2f" + ), + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + (t7 - t1) / (num_trials * s.shape[1] / fs), + ) if num_seg_parts > 1: - score_file = '%s-%03d-%03d' % (score_file, 1, seg_part_idx) - stats_file = '%s-%03d-%03d' % (stats_file, 1, seg_part_idx) - logging.info('saving scores to %s' % (score_file)) - s = TrialScores(key.model_set, - key.seg_set, - scores, - score_mask=np.logical_or(key.tar, key.non)) + score_file = "%s-%03d-%03d" % (score_file, 1, seg_part_idx) + stats_file = "%s-%03d-%03d" % (stats_file, 1, seg_part_idx) + logging.info("saving scores to %s" % (score_file)) + s = TrialScores( + key.model_set, key.seg_set, scores, score_mask=np.logical_or(key.tar, key.non) + ) s.save_txt(score_file) - logging.info('saving stats to %s' % (stats_file)) + logging.info("saving stats to %s" % (stats_file)) attack_stats.to_csv(stats_file) if __name__ == "__main__": parser = ArgumentParser( - description=('Eval cosine-scoring given enroll x-vector and ' - 'adversarial test wave obtained from a different model' - 'using ART')) + description=( + "Eval cosine-scoring given enroll x-vector and " + "adversarial test wave obtained from a different model" + "using ART" + ) + ) - parser.add_argument('--v-file', required=True) - parser.add_argument('--key-file', default=None) - parser.add_argument('--enroll-file', required=True) - parser.add_argument('--test-wav-file', required=True) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) - parser.add_argument('--transfer-v-file', required=True) + parser.add_argument("--transfer-v-file", required=True) 
AR.add_class_args(parser) - AF.add_class_args(parser, prefix='feats') - AF.add_class_args(parser, prefix='transfer_feats') + AF.add_class_args(parser, prefix="feats") + AF.add_class_args(parser, prefix="transfer_feats") - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--vad-path-prefix', - dest='vad_path_prefix', - default=None, - help=('scp file_path prefix for vad')) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) - parser.add_argument('--model-path', required=True) - parser.add_argument('--transfer-model-path', required=True) + parser.add_argument("--model-path", required=True) + parser.add_argument("--transfer-model-path", required=True) parser.add_argument( - '--embed-layer', + "--embed-layer", type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) - parser.add_argument('--use-gpu', - default=False, - action='store_true', - help='extract xvectors in gpu') + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) - AttackFactory.add_class_args(parser, prefix='attack') + AttackFactory.add_class_args(parser, prefix="attack") - parser.add_argument('--seg-part-idx', - default=1, - type=int, - help=('test part index')) + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) parser.add_argument( - '--num-seg-parts', + "--num-seg-parts", default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('-v', - '--verbose', - dest='verbose', - default=1, - choices=[0, 1, 2, 3], - type=int) - - parser.add_argument('--save-adv-wav', - default=False, - action='store_true', - help='save adversarial signals to disk') - parser.add_argument('--save-adv-wav-path', - default=None, - help='output path of adv signals') - parser.add_argument('--stats-file', - default=None, - help='output path of to save stats of adv signals') - parser.add_argument('--cal-file', - default=None, - help='score calibration file') - parser.add_argument('--transfer-cal-file', - default=None, - help='score calibration file for transfer model') - parser.add_argument('--threshold', - default=0, - type=float, - help='decision threshold') - parser.add_argument('--max-test-length', - default=None, - type=float, - help=('maximum length (secs) for the test side, ' - 'this is to avoid GPU memory errors')) + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument( + "--save-adv-wav", + default=False, + action="store_true", + help="save adversarial signals to disk", + ) + parser.add_argument( + "--save-adv-wav-path", default=None, help="output path of adv signals" + ) + parser.add_argument( + "--stats-file", default=None, help="output path of to save stats of adv signals" + ) + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument( + 
"--transfer-cal-file", + default=None, + help="score calibration file for transfer model", + ) + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + parser.add_argument( + "--max-test-length", + default=None, + type=float, + help=( + "maximum length (secs) for the test side, " + "this is to avoid GPU memory errors" + ), + ) args = parser.parse_args() config_logger(args.verbose) diff --git a/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py b/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py index 0bd70a12..bf227045 100755 --- a/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py +++ b/hyperion/bin/torch-extract-xvectors-from-wav-with-rttm.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -26,29 +31,30 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import TorchModelLoader as TML + def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_feats(device, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) feat_extractor.eval() feat_extractor.to(device) return feat_extractor def load_model(model_path, device): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.to(device) model.eval() return model @@ -60,40 +66,53 @@ def augment(key0, x0, augmenter, aug_df, aug_id): key = key0 else: x, aug_info = augmenter(x0) - key = '%s-aug-%02d' % (key0, aug_id) - aug_df_row = {'key_aug': key, 'key_orig': key0, - 'noise_type': aug_info['noise']['noise_type'], - 'snr': aug_info['noise']['snr'], - 'rir_type': aug_info['reverb']['rir_type'], - 'srr': aug_info['reverb']['srr'], - 'sdr': aug_info['sdr']} - + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) - + return key, x def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint( - low=min_utt_length, high=max_utt_length+1) + utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint( - low=0, high=x.shape[1]-utt_length) - x = x[:,first_frame:first_frame+utt_length] + first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] logging.info( - 'extract-random-utt %s of length=%d 
first-frame=%d' % ( - key, x.shape[1], first_frame)) + "extract-random-utt %s of length=%d first-frame=%d" + % (key, x.shape[1], first_frame) + ) return x -def extract_xvectors(input_spec, output_spec, rttm_file, - scp_sep, - model_path, chunk_length, embed_layer, - random_utt_length, min_utt_length, max_utt_length, - aug_cfg, num_augs, aug_info_path, - use_gpu, **kwargs): - - rng = np.random.RandomState(seed=1123581321+kwargs['part_idx']) +def extract_xvectors( + input_spec, + output_spec, + rttm_file, + scp_sep, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs +): + + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -108,10 +127,12 @@ def extract_xvectors(input_spec, output_spec, rttm_file, min_samples = int(feat_extractor.fs * feat_extractor.frame_length / 1000) ar_args = AR.filter_args(**kwargs) - logging.info('opening output stream: %s' % (output_spec)) + logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec, scp_sep=scp_sep) as writer: - logging.info('opening input stream: {} with args={}'.format(input_spec, ar_args)) + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) with AR(input_spec, **ar_args) as reader: rttm = RTTM.load(rttm_file) rttm = rttm.filter(reader.scp.key) @@ -127,7 +148,7 @@ def extract_xvectors(input_spec, output_spec, rttm_file, spk_names = rttm.get_uniq_names_for_file(key0) num_spks = len(spk_names) - logging.info('processing utt %s num-spks=%d' % (key0, num_spks)) + logging.info("processing utt %s num-spks=%d" % (key0, num_spks)) for aug_id in range(num_augs): t3 = time.time() @@ -150,40 +171,54 @@ def extract_xvectors(input_spec, output_spec, rttm_file, x_total = x max_samples = x.shape[0] - y = np.zeros((num_spks, model.embed_dim,), dtype=float_cpu()) + y = np.zeros( + ( + num_spks, + model.embed_dim, + ), + dtype=float_cpu(), + ) val_spks = np.ones((num_spks,), dtype=np.bool) for spk_id in range(num_spks): t4 = time.time() spk_name = spk_names[spk_id] mask = rttm.get_bin_sample_mask_for_spk( - key0, spk_name, feat_extractor.fs, - max_samples=max_samples) + key0, spk_name, feat_extractor.fs, max_samples=max_samples + ) x = x_total[mask] num_speech_samples = x.shape[0] logging.info( - 'utt %s spk-name %s %d/%d (%.2f %%) speech samples' % ( - key, spk_name, num_speech_samples, max_samples, - num_speech_samples/max_samples*100)) - + "utt %s spk-name %s %d/%d (%.2f %%) speech samples" + % ( + key, + spk_name, + num_speech_samples, + max_samples, + num_speech_samples / max_samples * 100, + ) + ) + if num_speech_samples < min_samples: val_spks[spk_id] = False logging.info( - 'utt %s spk-name %s %d < %d speech samples, skipping' % ( - key, spk_name, num_speech_samples, min_samples)) + "utt %s spk-name %s %d < %d speech samples, skipping" + % (key, spk_name, num_speech_samples, min_samples) + ) continue - with torch.no_grad(): + with torch.no_grad(): x = torch.tensor( - x[None,:], dtype=torch.get_default_dtype()).to( - device) + x[None, :], dtype=torch.get_default_dtype() + ).to(device) x = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if random_utt_length: x = select_random_chunk( - key, x, min_utt_length, max_utt_length, rng) - + key, x, min_utt_length, max_utt_length, rng + ) + # if random_utt_length: # utt_length = rng.randint( # 
low=min_utt_length, high=max_utt_length+1) @@ -197,81 +232,122 @@ def extract_xvectors(input_spec, output_spec, rttm_file, t6 = time.time() if x.shape[1] > 0: - x = x.transpose(1,2).contiguous() - y_i = model.extract_embed( - x, chunk_length=chunk_length, - embed_layer=embed_layer).cpu().numpy()[0] - y[spk_id,:] = y_i + x = x.transpose(1, 2).contiguous() + y_i = ( + model.extract_embed( + x, + chunk_length=chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) + y[spk_id, :] = y_i t7 = time.time() - tot_time = t7-t4 - logging.info(( - 'utt %s spk=%s total-time=%.3f feat-time=%.3f ' - 'embed-time=%.3f write-time=%.3f ' - 'rt-factor=%.2f') % ( - key, spk_name, tot_time, t5-t4, - t6-t5, t7-t6, num_speech_samples/fs[0]/tot_time)) + tot_time = t7 - t4 + logging.info( + ( + "utt %s spk=%s total-time=%.3f feat-time=%.3f " + "embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key, + spk_name, + tot_time, + t5 - t4, + t6 - t5, + t7 - t6, + num_speech_samples / fs[0] / tot_time, + ) + ) if not np.any(val_spks): - y = y[:1] #if none are valid spks, we keep a 1xdim 0 vector + y = y[:1] # if none are valid spks, we keep a 1xdim 0 vector else: - y = y[val_spks] # we keep speakers with at least 1 frame - - writer.write([key], [y]) + y = y[val_spks] # we keep speakers with at least 1 frame + writer.write([key], [y]) if aug_info_path is not None: aug_df = pd.concat(aug_df, ignore_index=True) - aug_df.to_csv(aug_info_path, index=False, na_rep='n/a') - + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + if __name__ == "__main__": - + parser = ArgumentParser( - description=('Extract x-vectors from waveform computing ' - 'acoustic features on the fly')) + description=( + "Extract x-vectors from waveform computing " "acoustic features on the fly" + ) + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_spec', required=True) - parser.add_argument('--scp-sep', default=' ', - help=('scp file field separator')) - parser.add_argument('--rttm-file', required=True, - help=('RTTM file path')) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument("--rttm-file", required=True, help=("RTTM file path")) AR.add_class_args(parser) - parser.add_argument('--aug-cfg', default=None) - parser.add_argument('--aug-info-path', default=None) - parser.add_argument('--num-augs', default=1, type=int, - help='number of augmentations per utterance') - - AF.add_class_args(parser, prefix='feats') - - parser.add_argument('--model-path', required=True) - parser.add_argument('--chunk-length', type=int, default=0, - help=('number of frames used in each forward pass ' - 'of the x-vector encoder,' - 'if 0 the full utterance is used')) - parser.add_argument('--embed-layer', type=int, default=None, - help=('classifier layer to get the embedding from, ' - 'if None, it uses layer set in training phase')) - - parser.add_argument('--random-utt-length', default=False, action='store_true', - help='calculates x-vector from a random chunk') - parser.add_argument('--min-utt-length', type=int, default=500, - help=('minimum utterance length when using random utt length')) - parser.add_argument('--max-utt-length', type=int, default=12000, - help=('maximum utterance length when using random utt length')) - - parser.add_argument('--output', dest='output_spec', required=True) - 
parser.add_argument('--use-gpu', default=False, action='store_true', - help='extract xvectors in gpu') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=500, + help=("minimum utterance length when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=int, + default=12000, + help=("maximum utterance length when using random utt length"), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) extract_xvectors(**namespace_to_dict(args)) - diff --git a/hyperion/bin/torch-extract-xvectors-from-wav.py b/hyperion/bin/torch-extract-xvectors-from-wav.py index ebbff641..0aea084e 100755 --- a/hyperion/bin/torch-extract-xvectors-from-wav.py +++ b/hyperion/bin/torch-extract-xvectors-from-wav.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -26,29 +31,30 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import TorchModelLoader as TML + def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_feats(device, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) feat_extractor.eval() feat_extractor.to(device) return feat_extractor def load_model(model_path, device): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + 
logging.info("xvector-model={}".format(model)) model.to(device) model.eval() return model @@ -60,40 +66,55 @@ def augment(key0, x0, augmenter, aug_df, aug_id): key = key0 else: x, aug_info = augmenter(x0) - key = '%s-aug-%02d' % (key0, aug_id) - aug_df_row = {'key_aug': key, 'key_orig': key0, - 'noise_type': aug_info['noise']['noise_type'], - 'snr': aug_info['noise']['snr'], - 'rir_type': aug_info['reverb']['rir_type'], - 'srr': aug_info['reverb']['srr'], - 'sdr': aug_info['sdr']} - + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) - + return key, x def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint( - low=min_utt_length, high=max_utt_length+1) + utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint( - low=0, high=x.shape[1]-utt_length) - x = x[:,first_frame:first_frame+utt_length] + first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] logging.info( - 'extract-random-utt %s of length=%d first-frame=%d' % ( - key, x.shape[1], first_frame)) + "extract-random-utt %s of length=%d first-frame=%d" + % (key, x.shape[1], first_frame) + ) return x -def extract_xvectors(input_spec, output_spec, vad_spec, write_num_frames_spec, - scp_sep, vad_path_prefix, - model_path, chunk_length, embed_layer, - random_utt_length, min_utt_length, max_utt_length, - aug_cfg, num_augs, aug_info_path, - use_gpu, **kwargs): - - rng = np.random.RandomState(seed=1123581321+kwargs['part_idx']) +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_num_frames_spec, + scp_sep, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs +): + + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) @@ -111,18 +132,20 @@ def extract_xvectors(input_spec, output_spec, vad_spec, write_num_frames_spec, num_augs = 1 ar_args = AR.filter_args(**kwargs) - logging.info('opening output stream: %s' % (output_spec)) + logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec, scp_sep=scp_sep) as writer: - logging.info('opening input stream: {} with args={}'.format( - input_spec, ar_args)) + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) with AR(input_spec, **ar_args) as reader: if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, - scp_sep=scp_sep) - + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create( + vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep + ) + while not reader.eof(): t1 = time.time() key, x0, fs = reader.read(1) @@ -133,42 +156,53 @@ def extract_xvectors(input_spec, output_spec, vad_spec, write_num_frames_spec, key0 = key[0] t2 = time.time() - logging.info('processing utt %s' % (key0)) + logging.info("processing utt %s" % (key0)) for aug_id in range(num_augs): t3 = time.time() key, x = augment(key0, x0, augmenter, 
aug_df, aug_id) t4 = time.time() with torch.no_grad(): x = torch.tensor( - x[None,:], dtype=torch.get_default_dtype()).to( - device) + x[None, :], dtype=torch.get_default_dtype() + ).to(device) x = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: - vad = v_reader.read( - key0, num_frames=tot_frames)[0] + vad = v_reader.read(key0, num_frames=tot_frames)[0] vad = torch.tensor(vad, dtype=torch.bool).to(device) - x = x[:,vad] + x = x[:, vad] logging.info( - 'utt %s detected %d/%d (%.2f %%) speech frames' % ( - key, x.shape[1], tot_frames, - x.shape[1]/tot_frames*100)) - + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + ) + if random_utt_length: x = select_random_chunk( - key, x, min_utt_length, max_utt_length, rng) + key, x, min_utt_length, max_utt_length, rng + ) t6 = time.time() if x.shape[1] == 0: y = np.zeros((model.embed_dim,), dtype=float_cpu()) else: - x = x.transpose(1,2).contiguous() - y = model.extract_embed( - x, chunk_length=chunk_length, - embed_layer=embed_layer).cpu().numpy()[0] + x = x.transpose(1, 2).contiguous() + y = ( + model.extract_embed( + x, + chunk_length=chunk_length, + embed_layer=embed_layer, + ) + .cpu() + .numpy()[0] + ) t7 = time.time() writer.write([key], [y]) @@ -178,76 +212,117 @@ def extract_xvectors(input_spec, output_spec, vad_spec, write_num_frames_spec, t8 = time.time() read_time = t2 - t1 - tot_time = read_time + t8-t3 - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f ' - 'aug-time=%.3f feat-time=%.3f ' - 'vad-time=%.3f embed-time=%.3f write-time=%.3f ' - 'rt-factor=%.2f') % ( - key, tot_time, read_time, t4-t3, t5-t4, - t6-t5, t7-t6, t8-t7, x0.shape[0]/fs[0]/tot_time)) + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + ) if write_num_frames_spec is not None: - logging.info('writing num-frames to %s' % (write_num_frames_spec)) + logging.info("writing num-frames to %s" % (write_num_frames_spec)) u2nf = Utt2Info.create(keys, info) u2nf.save(write_num_frames_spec) if aug_info_path is not None: aug_df = pd.concat(aug_df, ignore_index=True) - aug_df.to_csv(aug_info_path, index=False, na_rep='n/a') - + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") + if __name__ == "__main__": - - parser=ArgumentParser( - description=('Extracts x-vectors from waveform computing ' - 'acoustic features on the fly')) - - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_spec', required=True) - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--write-num-frames', dest='write_num_frames_spec', - default=None) - parser.add_argument('--scp-sep', default=' ', - help=('scp file field separator')) - parser.add_argument('--vad-path-prefix', default=None, - help=('scp file_path prefix for vad')) + + parser = ArgumentParser( + description=( + "Extracts x-vectors from waveform computing " "acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + 
) + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) AR.add_class_args(parser) - parser.add_argument('--aug-cfg', default=None) - parser.add_argument('--aug-info-path', default=None) - parser.add_argument('--num-augs', default=1, type=int, - help='number of augmentations per utterance') - - AF.add_class_args(parser, prefix='feats') - - parser.add_argument('--model-path', required=True) - parser.add_argument('--chunk-length', type=int, default=0, - help=('number of frames used in each forward pass ' - 'of the x-vector encoder,' - 'if 0 the full utterance is used')) - parser.add_argument('--embed-layer', type=int, default=None, - help=('classifier layer to get the embedding from, ' - 'if None, it uses layer set in training phase')) - - parser.add_argument('--random-utt-length', default=False, action='store_true', - help='calculates x-vector from a random chunk') - parser.add_argument('--min-utt-length', type=int, default=500, - help=('minimum utterance length when using random utt length')) - parser.add_argument('--max-utt-length', type=int, default=12000, - help=('maximum utterance length when using random utt length')) - - parser.add_argument('--output', dest='output_spec', required=True) - parser.add_argument('--use-gpu', default=False, action='store_true', - help='extract xvectors in gpu') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=500, + help=("minimum utterance length when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=int, + default=12000, + help=("maximum utterance length when using random utt length"), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) extract_xvectors(**namespace_to_dict(args)) - diff --git a/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py b/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py index 6e1e1a3c..e3ab70e9 100755 --- a/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py +++ b/hyperion/bin/torch-extract-xvectors-slidwin-from-wav.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from 
jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -27,29 +32,30 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import TorchModelLoader as TML + def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_feats(device, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) feat_extractor.eval() feat_extractor.to(device) return feat_extractor def load_model(model_path, device): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.to(device) model.eval() return model @@ -61,49 +67,64 @@ def augment(key0, x0, augmenter, aug_df, aug_id): key = key0 else: x, aug_info = augmenter(x0) - key = '%s-aug-%02d' % (key0, aug_id) - aug_df_row = {'key_aug': key, 'key_orig': key0, - 'noise_type': aug_info['noise']['noise_type'], - 'snr': aug_info['noise']['snr'], - 'rir_type': aug_info['reverb']['rir_type'], - 'srr': aug_info['reverb']['srr'], - 'sdr': aug_info['sdr']} - + key = "%s-aug-%02d" % (key0, aug_id) + aug_df_row = { + "key_aug": key, + "key_orig": key0, + "noise_type": aug_info["noise"]["noise_type"], + "snr": aug_info["noise"]["snr"], + "rir_type": aug_info["reverb"]["rir_type"], + "srr": aug_info["reverb"]["srr"], + "sdr": aug_info["sdr"], + } + aug_df.append(pd.DataFrame(aug_df_row, index=[0])) - + return key, x def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint( - low=min_utt_length, high=max_utt_length+1) + utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint( - low=0, high=x.shape[1]-utt_length) - x = x[:,first_frame:first_frame+utt_length] + first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] logging.info( - 'extract-random-utt %s of length=%d first-frame=%d' % ( - key, x.shape[1], first_frame)) + "extract-random-utt %s of length=%d first-frame=%d" + % (key, x.shape[1], first_frame) + ) return x -def extract_xvectors(input_spec, output_spec, vad_spec, - write_timestamps_spec, slidwin_params_path, - scp_sep, vad_path_prefix, - model_path, chunk_length, embed_layer, - win_length, win_shift, snip_edges, - aug_cfg, num_augs, aug_info_path, - use_gpu, **kwargs): - - rng = np.random.RandomState(seed=1123581321+kwargs['part_idx']) +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_timestamps_spec, + slidwin_params_path, + scp_sep, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + win_length, + win_shift, + snip_edges, + aug_cfg, + num_augs, + aug_info_path, + use_gpu, + **kwargs +): + + rng = np.random.RandomState(seed=1123581321 + 
kwargs["part_idx"]) device = init_device(use_gpu) feat_extractor = init_feats(device, **kwargs) model = load_model(model_path, device) - feat_args = kwargs['feats']['audio_feats'] - feat_frame_length=feat_args['frame_length'] - feat_frame_shift=feat_args['frame_shift'] - feat_snip_edges=feat_args['snip_edges'] + feat_args = kwargs["feats"]["audio_feats"] + feat_frame_length = feat_args["frame_length"] + feat_frame_shift = feat_args["frame_shift"] + feat_snip_edges = feat_args["snip_edges"] if write_timestamps_spec is not None: time_writer = DWF.create(write_timestamps_spec, scp_sep=scp_sep) @@ -117,17 +138,20 @@ def extract_xvectors(input_spec, output_spec, vad_spec, num_augs = 1 ar_args = AR.filter_args(**kwargs) - logging.info('opening output stream: %s' % (output_spec)) + logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec, scp_sep=scp_sep) as writer: - logging.info('opening input stream: {} with args={}'.format(input_spec, ar_args)) + logging.info( + "opening input stream: {} with args={}".format(input_spec, ar_args) + ) with AR(input_spec, **ar_args) as reader: if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) - v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, - scp_sep=scp_sep) - + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create( + vad_spec, path_prefix=vad_path_prefix, scp_sep=scp_sep + ) + while not reader.eof(): t1 = time.time() key, x0, fs = reader.read(1) @@ -138,40 +162,61 @@ def extract_xvectors(input_spec, output_spec, vad_spec, key0 = key[0] t2 = time.time() - logging.info('processing utt %s' % (key0)) + logging.info("processing utt %s" % (key0)) for aug_id in range(num_augs): t3 = time.time() key, x = augment(key0, x0, augmenter, aug_df, aug_id) t4 = time.time() with torch.no_grad(): x = torch.tensor( - x[None,:], dtype=torch.get_default_dtype()).to( - device) + x[None, :], dtype=torch.get_default_dtype() + ).to(device) x = feat_extractor(x) t5 = time.time() tot_frames = x.shape[1] if vad_spec is not None: - vad = v_reader.read( - key0, num_frames=tot_frames)[0] + vad = v_reader.read(key0, num_frames=tot_frames)[0] vad = torch.tensor(vad, dtype=torch.bool).to(device) - x = x[:,vad] + x = x[:, vad] logging.info( - 'utt %s detected %d/%d (%.2f %%) speech frames' % ( - key, x.shape[1], tot_frames, - x.shape[1]/tot_frames*100)) + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key, + x.shape[1], + tot_frames, + x.shape[1] / tot_frames * 100, + ) + ) t6 = time.time() if x.shape[1] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu()) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: - x = x.transpose(1,2).contiguous() - y = model.extract_embed_slidwin( - x, win_length, win_shift, snip_edges=snip_edges, - feat_frame_length=feat_frame_length, feat_frame_shift=feat_frame_shift, - chunk_length=chunk_length, - embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] + x = x.transpose(1, 2).contiguous() + y = ( + model.extract_embed_slidwin( + x, + win_length, + win_shift, + snip_edges=snip_edges, + feat_frame_length=feat_frame_length, + feat_frame_shift=feat_frame_shift, + chunk_length=chunk_length, + embed_layer=embed_layer, + detach_chunks=True, + ) + .detach() + .cpu() + .numpy()[0] + ) t7 = time.time() y = y.T @@ -180,109 +225,167 @@ def extract_xvectors(input_spec, output_spec, vad_spec, if write_timestamps_spec is not None: num_wins = y.shape[0] timestamps = model.compute_slidwin_timestamps( - num_wins, win_length, 
win_shift, snip_edges, - feat_frame_length, feat_frame_length, feat_snip_edges).numpy() - logging.info('{}'.format(timestamps)) + num_wins, + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_length, + feat_snip_edges, + ).numpy() + logging.info("{}".format(timestamps)) time_writer.write([key], [timestamps]) t8 = time.time() read_time = t2 - t1 - tot_time = read_time + t8-t3 - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f ' - 'aug-time=%.3f feat-time=%.3f ' - 'vad-time=%.3f embed-time=%.3f write-time=%.3f ' - 'rt-factor=%.2f') % ( - key, tot_time, read_time, t4-t3, t5-t4, - t6-t5, t7-t6, t8-t7, x0.shape[0]/fs[0]/tot_time)) + tot_time = read_time + t8 - t3 + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "aug-time=%.3f feat-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key, + tot_time, + read_time, + t4 - t3, + t5 - t4, + t6 - t5, + t7 - t6, + t8 - t7, + x0.shape[0] / fs[0] / tot_time, + ) + ) if write_timestamps_spec is not None: time_writer.close() if aug_info_path is not None: aug_df = pd.concat(aug_df, ignore_index=True) - aug_df.to_csv(aug_info_path, index=False, na_rep='n/a') + aug_df.to_csv(aug_info_path, index=False, na_rep="n/a") if slidwin_params_path is not None: - params = {'padding': model.compute_slidwin_left_padding( - win_length, win_shift, snip_edges, - feat_frame_length, feat_frame_length, feat_snip_edges), - 'win_length': win_length, - 'win_shift': win_shift} - with open(slidwin_params_path, 'w') as f: + params = { + "padding": model.compute_slidwin_left_padding( + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_length, + feat_snip_edges, + ), + "win_length": win_length, + "win_shift": win_shift, + } + with open(slidwin_params_path, "w") as f: yaml.dump(params, f) if __name__ == "__main__": - - parser = ArgumentParser( - description=('Extract x-vectors over a sliding window' - 'from waveform computing ' - 'acoustic features on the fly')) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_spec', required=True) - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--write-timestamps', dest='write_timestamps_spec', default=None) - parser.add_argument('--slidwin-params-path', default=None) - - parser.add_argument('--scp-sep', default=' ', - help=('scp file field separator')) - parser.add_argument('--vad-path-prefix', default=None, - help=('scp file_path prefix for vad')) + parser = ArgumentParser( + description=( + "Extract x-vectors over a sliding window" + "from waveform computing " + "acoustic features on the fly" + ) + ) + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-timestamps", dest="write_timestamps_spec", default=None + ) + parser.add_argument("--slidwin-params-path", default=None) + + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) AR.add_argparse_args(parser) - parser.add_argument('--aug-cfg', default=None) - parser.add_argument('--aug-info-path', default=None) - parser.add_argument('--num-augs', default=1, type=int, - help='number of augmentations per utterance') - - AF.add_class_args(parser, prefix='feats') - - parser.add_argument('--model-path', required=True) 
- parser.add_argument('--win-length', type=float, default=1.5, - help=('window length for x-vector extraction in seconds')) - parser.add_argument('--win-shift', type=float, default=0.25, - help=('window shift for x-vector extraction in seconds')) - parser.add_argument('--snip-edges', default=False, action='store_true', - help=('If true, end effects will be handled by outputting ' - 'only windows that completely fit in the file, ' - 'and the number of windows depends on the window-length. ' - 'If false, the number of windows depends only on ' - 'the window-shift, and we reflect the data at the ends.')) - - parser.add_argument('--chunk-length', type=int, default=0, - help=('number of frames used in each forward pass ' - 'of the x-vector encoder,' - 'if 0 the full utterance is used')) - parser.add_argument('--embed-layer', type=int, default=None, - help=('classifier layer to get the embedding from, ' - 'if None, it uses layer set in training phase')) + parser.add_argument("--aug-cfg", default=None) + parser.add_argument("--aug-info-path", default=None) + parser.add_argument( + "--num-augs", default=1, type=int, help="number of augmentations per utterance" + ) + + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--win-length", + type=float, + default=1.5, + help=("window length for x-vector extraction in seconds"), + ) + parser.add_argument( + "--win-shift", + type=float, + default=0.25, + help=("window shift for x-vector extraction in seconds"), + ) + parser.add_argument( + "--snip-edges", + default=False, + action="store_true", + help=( + "If true, end effects will be handled by outputting " + "only windows that completely fit in the file, " + "and the number of windows depends on the window-length. " + "If false, the number of windows depends only on " + "the window-shift, and we reflect the data at the ends." 
+ ), + ) + + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass " + "of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from, " + "if None, it uses layer set in training phase" + ), + ) # parser.add_argument('--random-utt-length', default=False, action='store_true', # help='calculates x-vector from a random chunk') - # parser.add_argument('--min-utt-length', type=int, default=500, + # parser.add_argument('--min-utt-length', type=int, default=500, # help=('minimum utterance length when using random utt length')) - # parser.add_argument('--max-utt-length', type=int, default=12000, + # parser.add_argument('--max-utt-length', type=int, default=12000, # help=('maximum utterance length when using random utt length')) - parser.add_argument('--output', dest='output_spec', required=True) - parser.add_argument('--use-gpu', default=False, action='store_true', - help='extract xvectors in gpu') + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) # parser.add_argument('--part-idx', dest='part_idx', type=int, default=1, # help=('splits the list of files in num-parts ' # 'and process part_idx')) # parser.add_argument('--num-parts', dest='num_parts', type=int, default=1, # help=('splits the list of files in num-parts ' # 'and process part_idx')) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) extract_xvectors(**namespace_to_dict(args)) - diff --git a/hyperion/bin/torch-extract-xvectors-slidwin.py b/hyperion/bin/torch-extract-xvectors-slidwin.py index 2a2c33fb..0e2f0173 100755 --- a/hyperion/bin/torch-extract-xvectors-slidwin.py +++ b/hyperion/bin/torch-extract-xvectors-slidwin.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -24,17 +29,18 @@ from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML + def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_mvn(device, **kwargs): - mvn_args = MVN.filter_args(**kwargs['mvn']) - logging.info('mvn args={}'.format(mvn_args)) + mvn_args = MVN.filter_args(**kwargs["mvn"]) + logging.info("mvn args={}".format(mvn_args)) mvn = MVN(**mvn_args) if mvn.norm_mean or mvn.norm_var: return mvn @@ -42,73 +48,104 @@ def init_mvn(device, **kwargs): def load_model(model_path, device): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.to(device) model.eval() return model -def 
extract_xvectors(input_spec, output_spec, vad_spec, - write_timestamps_spec, slidwin_params_path, - vad_path_prefix, - model_path, chunk_length, embed_layer, - win_length, win_shift, snip_edges, - feat_frame_length, feat_frame_shift, feat_snip_edges, - use_gpu, **kwargs): - - logging.info('initializing') - rng = np.random.RandomState(seed=1123581321+kwargs['part_idx']) +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_timestamps_spec, + slidwin_params_path, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_shift, + feat_snip_edges, + use_gpu, + **kwargs +): + + logging.info("initializing") + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) - + if write_timestamps_spec is not None: time_writer = DWF.create(write_timestamps_spec, scp_sep=scp_sep) - + dr_args = DRF.filter_args(**kwargs) - logging.info('opening output stream: %s' % (output_spec)) + logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info('opening input stream: %s' % (output_spec)) + logging.info("opening input stream: %s" % (output_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) + logging.info("opening VAD stream: %s" % (vad_spec)) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) - + while not reader.eof(): t1 = time.time() key, data = reader.read(1) if len(key) == 0: break t2 = time.time() - logging.info('processing utt %s' % (key[0])) + logging.info("processing utt %s" % (key[0])) x = data[0] if mvn is not None: x = mvn.normalize(x) t3 = time.time() tot_frames = x.shape[0] if vad_spec is not None: - vad = v_reader.read( - key, num_frames=x.shape[0])[0].astype( - 'bool', copy=False) + vad = v_reader.read(key, num_frames=x.shape[0])[0].astype( + "bool", copy=False + ) x = x[vad] - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % ( - key[0], x.shape[0], tot_frames, x.shape[0]/tot_frames*100)) - + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % (key[0], x.shape[0], tot_frames, x.shape[0] / tot_frames * 100) + ) + t4 = time.time() if x.shape[0] == 0: - y = np.zeros((1, model.embed_dim,), dtype=float_cpu()) + y = np.zeros( + ( + 1, + model.embed_dim, + ), + dtype=float_cpu(), + ) else: - xx = torch.tensor(x.T[None,:], dtype=torch.get_default_dtype()) + xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) with torch.no_grad(): - y = model.extract_embed_slidwin( - xx, win_length, win_shift, snip_edges=snip_edges, - feat_frame_length=feat_frame_length, feat_frame_shift=feat_frame_shift, - chunk_length=chunk_length, - embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] + y = ( + model.extract_embed_slidwin( + xx, + win_length, + win_shift, + snip_edges=snip_edges, + feat_frame_length=feat_frame_length, + feat_frame_shift=feat_frame_shift, + chunk_length=chunk_length, + embed_layer=embed_layer, + detach_chunks=True, + ) + .detach() + .cpu() + .numpy()[0] + ) # if np.any(np.isnan(y)): # y = y.T @@ -118,42 +155,41 @@ def extract_xvectors(input_spec, output_spec, vad_spec, # raise Exception() # y1 = model.extract_embed( # xx[:,:,:148], - # chunk_length=chunk_length, + # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} 
{}'.format(y.shape, y1.shape)) # logging.info('{} {}'.format(y[:20, 0], y1[:20])) # y2 = model.extract_embed( - # xx[:,:,25:173], - # chunk_length=chunk_length, + # xx[:,:,25:173], + # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y[:20, 1], y2[:20])) # y3 = model.extract_embed( - # xx[:,:,250:398], - # chunk_length=chunk_length, + # xx[:,:,250:398], + # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y[:20, 10], y3[:20])) - # win_length = 20 # y = model.extract_embed_slidwin( # xx, win_length, win_shift, snip_edges=True, # feat_frame_length=feat_frame_length, feat_frame_shift=feat_frame_shift, - # chunk_length=chunk_length, + # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # y1 = model.extract_embed( - # xx[:,:,:1999], - # chunk_length=chunk_length, + # xx[:,:,:1999], + # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y.shape, y1.shape)) # logging.info('{} {}'.format(y[:20, 0], y1[:20])) # y2 = model.extract_embed( - # xx[:,:,25:2024], - # chunk_length=chunk_length, + # xx[:,:,25:2024], + # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y[:20, 1], y2[:20])) # y3 = model.extract_embed( - # xx[:,:,250:2249], + # xx[:,:,250:2249], # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y[:20, 10], y3[:20])) @@ -162,39 +198,38 @@ def extract_xvectors(input_spec, output_spec, vad_spec, # y = model.extract_embed_slidwin( # xx, win_length, win_shift, snip_edges=False, # feat_frame_length=feat_frame_length, feat_frame_shift=feat_frame_shift, - # chunk_length=chunk_length, + # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # y1 = model.extract_embed( - # xx[:,:,:1112], - # chunk_length=chunk_length, + # xx[:,:,:1112], + # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y.shape, y1.shape)) # logging.info('{} {}'.format(y[:20, 0], y1[:20])) # y2 = model.extract_embed( - # xx[:,:,25:1037], - # chunk_length=chunk_length, + # xx[:,:,25:1037], + # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y[:20, 1], y2[:20])) # y3 = model.extract_embed( - # xx[:,:,250:1262], + # xx[:,:,250:1262], # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y[:20, 10], y3[:20])) # y3 = model.extract_embed( - # xx[:,:,250:1262], + # xx[:,:,250:1262], # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y[:20, 10], y3[:20])) # y3 = model.extract_embed( - # xx[:,:,2500:3512], + # xx[:,:,2500:3512], # chunk_length=chunk_length, # embed_layer=embed_layer, detach_chunks=True).detach().cpu().numpy()[0] # logging.info('{} {}'.format(y[:20, 100], y3[:20])) - t5 = time.time() y = y.T writer.write(key, [y]) @@ -202,96 +237,167 @@ def extract_xvectors(input_spec, output_spec, vad_spec, if write_timestamps_spec is not None: num_wins = y.shape[0] timestamps = model.compute_slidwin_timestamps( - num_wins, win_length, 
win_shift, snip_edges, - feat_frame_length, feat_frame_length, feat_snip_edges).numpy() - logging.info('{}'.format(timestamps)) + num_wins, + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_length, + feat_snip_edges, + ).numpy() + logging.info("{}".format(timestamps)) time_writer.write(key, [timestamps]) t6 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f ' - 'vad-time=%.3f embed-time=%.3f write-time=%.3f ' - 'rt-factor=%.2f') % ( - key[0], t6-t1, t2-t1, t3-t2, t4-t3, - t5-t4, t6-t5, x.shape[0]*1e-2/(t6-t1))) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key[0], + t6 - t1, + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + t6 - t5, + x.shape[0] * 1e-2 / (t6 - t1), + ) + ) if write_timestamps_spec is not None: time_writer.close() if slidwin_params_path is not None: - params = {'padding': model.compute_slidwin_left_padding( - win_length, win_shift, snip_edges, - feat_frame_length, feat_frame_length, feat_snip_edges), - 'win_length': win_length, - 'win_shift': win_shift} - with open(slidwin_params_path, 'w') as f: + params = { + "padding": model.compute_slidwin_left_padding( + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_length, + feat_snip_edges, + ), + "win_length": win_length, + "win_shift": win_shift, + } + with open(slidwin_params_path, "w") as f: yaml.dump(params, f) - - + + if __name__ == "__main__": - - parser = ArgumentParser( - description='Extract x-vectors over a sliding window') - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_spec', required=True) + parser = ArgumentParser(description="Extract x-vectors over a sliding window") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) DRF.add_class_args(parser) - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--write-timestamps', dest='write_timestamps_spec', default=None) - parser.add_argument('--slidwin-params-path', default=None) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-timestamps", dest="write_timestamps_spec", default=None + ) + parser.add_argument("--slidwin-params-path", default=None) # parser.add_argument('--scp-sep', dest='scp_sep', default=' ', # help=('scp file field separator')) # parser.add_argument('--path-prefix', dest='path_prefix', default=None, # help=('scp file_path prefix')) - parser.add_argument('--vad-path-prefix', dest='vad_path_prefix', default=None, - help=('scp file_path prefix for vad')) - - MVN.add_class_args(parser, prefix='mvn') - - parser.add_argument('--model-path', required=True) - parser.add_argument('--win-length', type=float, default=1.5, - help=('window length for x-vector extraction in seconds')) - parser.add_argument('--win-shift', type=float, default=0.25, - help=('window shift for x-vector extraction in seconds')) - parser.add_argument('--snip-edges', default=False, action='store_true', - help=('If true, end effects will be handled by outputting ' - 'only windows that completely fit in the file, ' - 'and the number of windows depends on the window-length. 
' - 'If false, the number of windows depends only on ' - 'the window-shift, and we reflect the data at the ends.')) - - parser.add_argument('--feat-frame-length', type=float, default=25, - help=('frame-length used to compute the acoustic features in msecs')) - parser.add_argument('--feat-frame-shift', type=float, default=10, - help=('frame-shift used to compute the acoustic features in msecs')) - parser.add_argument('--feat-snip-edges', default=False, action='store_true', - help=('If true, end effects will be handled by outputting only windows ' - 'that completely fit in the file, and the number of windows ' - 'depends on the feat-frame-length. ' - 'If false, the number of feature frames depends only on the ' - 'feat-frame-shift, and we reflect the waveform at the ends.')) - - parser.add_argument('--chunk-length', type=int, default=0, - help=('number of frames used in each forward pass of the x-vector encoder,' - 'if 0 the full utterance is used')) - - parser.add_argument('--embed-layer', type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) - - parser.add_argument('--output', dest='output_spec', required=True) - parser.add_argument('--use-gpu', default=False, action='store_true', - help='extract xvectors in gpu') + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + MVN.add_class_args(parser, prefix="mvn") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--win-length", + type=float, + default=1.5, + help=("window length for x-vector extraction in seconds"), + ) + parser.add_argument( + "--win-shift", + type=float, + default=0.25, + help=("window shift for x-vector extraction in seconds"), + ) + parser.add_argument( + "--snip-edges", + default=False, + action="store_true", + help=( + "If true, end effects will be handled by outputting " + "only windows that completely fit in the file, " + "and the number of windows depends on the window-length. " + "If false, the number of windows depends only on " + "the window-shift, and we reflect the data at the ends." + ), + ) + + parser.add_argument( + "--feat-frame-length", + type=float, + default=25, + help=("frame-length used to compute the acoustic features in msecs"), + ) + parser.add_argument( + "--feat-frame-shift", + type=float, + default=10, + help=("frame-shift used to compute the acoustic features in msecs"), + ) + parser.add_argument( + "--feat-snip-edges", + default=False, + action="store_true", + help=( + "If true, end effects will be handled by outputting only windows " + "that completely fit in the file, and the number of windows " + "depends on the feat-frame-length. " + "If false, the number of feature frames depends only on the " + "feat-frame-shift, and we reflect the waveform at the ends." 
+ ), + ) + + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) # parser.add_argument('--part-idx', dest='part_idx', type=int, default=1, # help=('splits the list of files in num-parts and process part_idx')) # parser.add_argument('--num-parts', dest='num_parts', type=int, default=1, # help=('splits the list of files in num-parts and process part_idx')) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) extract_xvectors(**namespace_to_dict(args)) - diff --git a/hyperion/bin/torch-extract-xvectors-vae-preproc.py b/hyperion/bin/torch-extract-xvectors-vae-preproc.py index 6566aa87..376de911 100755 --- a/hyperion/bin/torch-extract-xvectors-vae-preproc.py +++ b/hyperion/bin/torch-extract-xvectors-vae-preproc.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -24,17 +29,18 @@ from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML + def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_mvn(device, **kwargs): - mvn_args = MVN.filter_args(**kwargs['mvn']) - logging.info('mvn args={}'.format(mvn_args)) + mvn_args = MVN.filter_args(**kwargs["mvn"]) + logging.info("mvn args={}".format(mvn_args)) mvn = MVN(**mvn_args) if mvn.norm_mean or mvn.norm_var: return mvn @@ -42,41 +48,50 @@ def init_mvn(device, **kwargs): def load_model(model_path, device, model_name): - logging.info('loading {}-model {}'.format(model_name, model_path)) + logging.info("loading {}-model {}".format(model_name, model_path)) model = TML.load(model_path) - logging.info('{}-model={}'.format(model_name, model)) + logging.info("{}-model={}".format(model_name, model)) model.to(device) model.eval() return model def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint( - low=min_utt_length, high=max_utt_length+1) + utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint( - low=0, high=x.shape[1]-utt_length) - x = x[:,first_frame:first_frame+utt_length] + first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] logging.info( - 'extract-random-utt %s of length=%d first-frame=%d' % ( - key, x.shape[1], first_frame)) + "extract-random-utt %s of length=%d first-frame=%d" + % (key, x.shape[1], 
first_frame) + ) return x -def extract_xvectors(input_spec, output_spec, vad_spec, write_num_frames_spec, - vad_path_prefix, - xvec_model_path, chunk_length, embed_layer, - vae_model_path, - random_utt_length, min_utt_length, max_utt_length, - use_gpu, **kwargs): +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_num_frames_spec, + vad_path_prefix, + xvec_model_path, + chunk_length, + embed_layer, + vae_model_path, + random_utt_length, + min_utt_length, + max_utt_length, + use_gpu, + **kwargs +): - logging.info('initializing') - rng = np.random.RandomState(seed=1123581321+kwargs['part_idx']) + logging.info("initializing") + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) - xvec_model = load_model(xvec_model_path, device, 'xvector') - vae_model = load_model(vae_model_path, device, 'vae') - + xvec_model = load_model(xvec_model_path, device, "xvector") + vae_model = load_model(vae_model_path, device, "vae") + if write_num_frames_spec is not None: keys = [] info = [] @@ -97,55 +112,63 @@ def extract_xvectors(input_spec, output_spec, vad_spec, write_num_frames_spec, # logging.info('vae={}'.format(vae_model)) mse_loss = torch.nn.MSELoss() - + dr_args = DRF.filter_args(**kwargs) - logging.info('opening output stream: %s' % (output_spec)) + logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info('opening input stream: %s' % (input_spec)) + logging.info("opening input stream: %s" % (input_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) + logging.info("opening VAD stream: %s" % (vad_spec)) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) - + while not reader.eof(): t1 = time.time() key, data = reader.read(1) if len(key) == 0: break t2 = time.time() - logging.info('processing utt %s' % (key[0])) + logging.info("processing utt %s" % (key[0])) x = data[0] if mvn is not None: x = mvn.normalize(x) t3 = time.time() tot_frames = x.shape[0] if vad_spec is not None: - vad = v_reader.read( - key, num_frames=x.shape[0])[0].astype( - 'bool', copy=False) + vad = v_reader.read(key, num_frames=x.shape[0])[0].astype( + "bool", copy=False + ) x = x[vad] - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % ( - key[0], x.shape[0], tot_frames, x.shape[0]/tot_frames*100)) - + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % (key[0], x.shape[0], tot_frames, x.shape[0] / tot_frames * 100) + ) + if random_utt_length: - x = select_random_chunk( - key, x, min_utt_length, max_utt_length, rng) + x = select_random_chunk(key, x, min_utt_length, max_utt_length, rng) t4 = time.time() if x.shape[0] == 0: y = np.zeros((model.embed_dim,), dtype=float_cpu()) else: - xx = torch.tensor(x.T[None,:], dtype=torch.get_default_dtype()) + xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) with torch.no_grad(): xx = xx.to(device) vae_output = vae_model(xx, return_x_mean=True) - x_clean = vae_output['x_mean'] + x_clean = vae_output["x_mean"] mse = mse_loss(x_clean, xx).item() - y = xvec_model.extract_embed( - x_clean, chunk_length=chunk_length, - embed_layer=embed_layer).detach().cpu().numpy()[0] + y = ( + xvec_model.extract_embed( + x_clean, + chunk_length=chunk_length, + embed_layer=embed_layer, + ) + .detach() + .cpu() + .numpy()[0] + ) t5 = time.time() writer.write(key, [y]) @@ -153,67 +176,107 @@ def extract_xvectors(input_spec, output_spec, 
vad_spec, write_num_frames_spec, keys.append(key[0]) info.append(str(x.shape[0])) t6 = time.time() - logging.info(( - 'utt %s mse=%.3f total-time=%.3f read-time=%.3f mvn-time=%.3f ' - 'vad-time=%.3f embed-time=%.3f write-time=%.3f ' - 'rt-factor=%.2f') % ( - key[0], mse, t6-t1, t2-t1, t3-t2, t4-t3, - t5-t4, t6-t5, x.shape[0]*1e-2/(t6-t1))) + logging.info( + ( + "utt %s mse=%.3f total-time=%.3f read-time=%.3f mvn-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key[0], + mse, + t6 - t1, + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + t6 - t5, + x.shape[0] * 1e-2 / (t6 - t1), + ) + ) if write_num_frames_spec is not None: - logging.info('writing num-frames to %s' % (write_num_frames_spec)) + logging.info("writing num-frames to %s" % (write_num_frames_spec)) u2nf = Utt2Info.create(keys, info) u2nf.save(write_num_frames_spec) - - + + if __name__ == "__main__": - - parser = ArgumentParser( - description='Extract x-vectors with pytorch model') - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_spec', required=True) + parser = ArgumentParser(description="Extract x-vectors with pytorch model") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) DRF.add_class_args(parser) - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--write-num-frames', dest='write_num_frames_spec', default=None) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) # parser.add_argument('--scp-sep', default=' ', # help=('scp file field separator')) # parser.add_argument('--path-prefix', default=None, # help=('scp file_path prefix')) - parser.add_argument('--vad-path-prefix', default=None, - help=('scp file_path prefix for vad')) - - MVN.add_class_args(parser, prefix='mvn') - - parser.add_argument('--xvec-model-path', required=True) - parser.add_argument('--vae-model-path', required=True) - parser.add_argument('--chunk-length', type=int, default=0, - help=('number of frames used in each forward pass of the x-vector encoder,' - 'if 0 the full utterance is used')) - parser.add_argument('--embed-layer', type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) - - parser.add_argument('--random-utt-length', default=False, action='store_true', - help='calculates x-vector from a random chunk of the utterance') - parser.add_argument('--min-utt-length', type=int, default=500, - help=('minimum utterance length when using random utt length')) - parser.add_argument('--max-utt-length', type=int, default=12000, - help=('maximum utterance length when using random utt length')) - - parser.add_argument('--output', dest='output_spec', required=True) - parser.add_argument('--use-gpu', default=False, action='store_true', - help='extract xvectors in gpu') + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + MVN.add_class_args(parser, prefix="mvn") + + parser.add_argument("--xvec-model-path", required=True) + parser.add_argument("--vae-model-path", required=True) + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + 
help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk of the utterance", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=500, + help=("minimum utterance length when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=int, + default=12000, + help=("maximum utterance length when using random utt length"), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) # parser.add_argument('--part-idx', dest='part_idx', type=int, default=1, # help=('splits the list of files in num-parts and process part_idx')) # parser.add_argument('--num-parts', dest='num_parts', type=int, default=1, # help=('splits the list of files in num-parts and process part_idx')) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) extract_xvectors(**namespace_to_dict(args)) - diff --git a/hyperion/bin/torch-extract-xvectors.py b/hyperion/bin/torch-extract-xvectors.py index 3165dd83..18bab96f 100755 --- a/hyperion/bin/torch-extract-xvectors.py +++ b/hyperion/bin/torch-extract-xvectors.py @@ -6,7 +6,12 @@ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging @@ -24,17 +29,18 @@ from hyperion.torch.utils import open_device from hyperion.torch import TorchModelLoader as TML + def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_mvn(device, **kwargs): - mvn_args = MVN.filter_args(**kwargs['mvn']) - logging.info('mvn args={}'.format(mvn_args)) + mvn_args = MVN.filter_args(**kwargs["mvn"]) + logging.info("mvn args={}".format(mvn_args)) mvn = MVN(**mvn_args) if mvn.norm_mean or mvn.norm_var: return mvn @@ -42,35 +48,44 @@ def init_mvn(device, **kwargs): def load_model(model_path, device): - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) model = TML.load(model_path) - logging.info('xvector-model={}'.format(model)) + logging.info("xvector-model={}".format(model)) model.to(device) model.eval() return model def select_random_chunk(key, x, min_utt_length, max_utt_length, rng): - utt_length = rng.randint( - low=min_utt_length, high=max_utt_length+1) + utt_length = rng.randint(low=min_utt_length, high=max_utt_length + 1) if utt_length < x.shape[1]: - first_frame = rng.randint( - low=0, high=x.shape[1]-utt_length) - x = x[:,first_frame:first_frame+utt_length] + first_frame = rng.randint(low=0, high=x.shape[1] - utt_length) + x = x[:, first_frame : first_frame + utt_length] logging.info( - 'extract-random-utt %s of length=%d first-frame=%d' % ( - key, x.shape[1], first_frame)) + "extract-random-utt 
%s of length=%d first-frame=%d" + % (key, x.shape[1], first_frame) + ) return x -def extract_xvectors(input_spec, output_spec, vad_spec, write_num_frames_spec, - vad_path_prefix, - model_path, chunk_length, embed_layer, - random_utt_length, min_utt_length, max_utt_length, - use_gpu, **kwargs): - - logging.info('initializing') - rng = np.random.RandomState(seed=1123581321+kwargs['part_idx']) +def extract_xvectors( + input_spec, + output_spec, + vad_spec, + write_num_frames_spec, + vad_path_prefix, + model_path, + chunk_length, + embed_layer, + random_utt_length, + min_utt_length, + max_utt_length, + use_gpu, + **kwargs +): + + logging.info("initializing") + rng = np.random.RandomState(seed=1123581321 + kwargs["part_idx"]) device = init_device(use_gpu) mvn = init_mvn(device, **kwargs) model = load_model(model_path, device) @@ -80,49 +95,55 @@ def extract_xvectors(input_spec, output_spec, vad_spec, write_num_frames_spec, info = [] dr_args = DRF.filter_args(**kwargs) - logging.info('opening output stream: %s' % (output_spec)) + logging.info("opening output stream: %s" % (output_spec)) with DWF.create(output_spec) as writer: - logging.info('opening input stream: %s' % (input_spec)) + logging.info("opening input stream: %s" % (input_spec)) with DRF.create(input_spec, **dr_args) as reader: if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) + logging.info("opening VAD stream: %s" % (vad_spec)) v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix) - + while not reader.eof(): t1 = time.time() key, data = reader.read(1) if len(key) == 0: break t2 = time.time() - logging.info('processing utt %s' % (key[0])) + logging.info("processing utt %s" % (key[0])) x = data[0] if mvn is not None: x = mvn.normalize(x) t3 = time.time() tot_frames = x.shape[0] if vad_spec is not None: - vad = v_reader.read( - key, num_frames=x.shape[0])[0].astype( - 'bool', copy=False) + vad = v_reader.read(key, num_frames=x.shape[0])[0].astype( + "bool", copy=False + ) x = x[vad] - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % ( - key[0], x.shape[0], tot_frames, x.shape[0]/tot_frames*100)) - + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % (key[0], x.shape[0], tot_frames, x.shape[0] / tot_frames * 100) + ) + if random_utt_length: - x = select_random_chunk( - key, x, min_utt_length, max_utt_length, rng) + x = select_random_chunk(key, x, min_utt_length, max_utt_length, rng) t4 = time.time() if x.shape[0] == 0: y = np.zeros((model.embed_dim,), dtype=float_cpu()) else: - xx = torch.tensor(x.T[None,:], dtype=torch.get_default_dtype()) + xx = torch.tensor(x.T[None, :], dtype=torch.get_default_dtype()) with torch.no_grad(): - y = model.extract_embed( - xx, chunk_length=chunk_length, - embed_layer=embed_layer).detach().cpu().numpy()[0] + y = ( + model.extract_embed( + xx, chunk_length=chunk_length, embed_layer=embed_layer + ) + .detach() + .cpu() + .numpy()[0] + ) t5 = time.time() writer.write(key, [y]) @@ -130,66 +151,105 @@ def extract_xvectors(input_spec, output_spec, vad_spec, write_num_frames_spec, keys.append(key[0]) info.append(str(x.shape[0])) t6 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f ' - 'vad-time=%.3f embed-time=%.3f write-time=%.3f ' - 'rt-factor=%.2f') % ( - key[0], t6-t1, t2-t1, t3-t2, t4-t3, - t5-t4, t6-t5, x.shape[0]*1e-2/(t6-t1))) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f mvn-time=%.3f " + "vad-time=%.3f embed-time=%.3f write-time=%.3f " + "rt-factor=%.2f" + ) + % ( + key[0], + t6 - t1, 
+ t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + t6 - t5, + x.shape[0] * 1e-2 / (t6 - t1), + ) + ) if write_num_frames_spec is not None: - logging.info('writing num-frames to %s' % (write_num_frames_spec)) + logging.info("writing num-frames to %s" % (write_num_frames_spec)) u2nf = Utt2Info.create(keys, info) u2nf.save(write_num_frames_spec) - - + + if __name__ == "__main__": - - parser = ArgumentParser( - description='Extracts x-vectors from features') - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--input', dest='input_spec', required=True) + parser = ArgumentParser(description="Extracts x-vectors from features") + + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--input", dest="input_spec", required=True) DRF.add_class_args(parser) - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--write-num-frames', dest='write_num_frames_spec', default=None) + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--write-num-frames", dest="write_num_frames_spec", default=None + ) # parser.add_argument('--scp-sep', default=' ', # help=('scp file field separator')) # parser.add_argument('--path-prefix', default=None, # help=('scp file_path prefix')) - parser.add_argument('--vad-path-prefix', default=None, - help=('scp file_path prefix for vad')) - - MVN.add_class_args(parser, prefix='mvn') - - parser.add_argument('--model-path', required=True) - parser.add_argument('--chunk-length', type=int, default=0, - help=('number of frames used in each forward pass of the x-vector encoder,' - 'if 0 the full utterance is used')) - parser.add_argument('--embed-layer', type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) - - parser.add_argument('--random-utt-length', default=False, action='store_true', - help='calculates x-vector from a random chunk of the utterance') - parser.add_argument('--min-utt-length', type=int, default=500, - help=('minimum utterance length when using random utt length')) - parser.add_argument('--max-utt-length', type=int, default=12000, - help=('maximum utterance length when using random utt length')) - - parser.add_argument('--output', dest='output_spec', required=True) - parser.add_argument('--use-gpu', default=False, action='store_true', - help='extract xvectors in gpu') + parser.add_argument( + "--vad-path-prefix", default=None, help=("scp file_path prefix for vad") + ) + + MVN.add_class_args(parser, prefix="mvn") + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--chunk-length", + type=int, + default=0, + help=( + "number of frames used in each forward pass of the x-vector encoder," + "if 0 the full utterance is used" + ), + ) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk of the utterance", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=500, + help=("minimum utterance length when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=int, + default=12000, + help=("maximum utterance length when using random utt length"), + ) + + parser.add_argument("--output", dest="output_spec", required=True) + parser.add_argument( + 
"--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) # parser.add_argument('--part-idx', dest='part_idx', type=int, default=1, # help=('splits the list of files in num-parts and process part_idx')) # parser.add_argument('--num-parts', dest='num_parts', type=int, default=1, # help=('splits the list of files in num-parts and process part_idx')) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) extract_xvectors(**namespace_to_dict(args)) - diff --git a/hyperion/bin/torch-finetune-xvec-dfr-from-wav.py b/hyperion/bin/torch-finetune-xvec-dfr-from-wav.py index 35264dde..437c76f0 100755 --- a/hyperion/bin/torch-finetune-xvec-dfr-from-wav.py +++ b/hyperion/bin/torch-finetune-xvec-dfr-from-wav.py @@ -5,10 +5,15 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -25,54 +30,69 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import TorchModelLoader as TML -def init_data(audio_path, train_list, val_list, - train_aug_cfg, val_aug_cfg, num_workers, - num_gpus, rank, **kwargs): + +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs +): ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(ad_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) + feat_args = AF.filter_args(**kwargs["feats"]) if rank == 0: - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=True, **feat_args) if rank == 0: - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) return 
feat_extractor -def init_xvector(num_classes, in_model_path, prior_model_path, - rank, train_mode, **kwargs): - xvec_args = XVec.filter_finetune_args(**kwargs) +def init_xvector( + num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs +): + + xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: - logging.info('xvector network ft args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = TML.load(in_model_path) model.rebuild_output_layer(**xvec_args) if prior_model_path: @@ -81,10 +101,10 @@ def init_xvector(num_classes, in_model_path, prior_model_path, prior_model = model.copy() prior_model.freeze() prior_model.eval() - if train_mode == 'ft-embed-affine': + if train_mode == "ft-embed-affine": model.freeze_preembed_layers() if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model, prior_model @@ -96,26 +116,32 @@ def train_xvec(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') - - train_mode = kwargs['train_mode'] + set_float_cpu("float32") + + train_mode = kwargs["train_mode"] ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) feat_extractor = init_feats(**kwargs) - model, prior_model = init_xvector( - train_loader.dataset.num_classes, **kwargs) + model, prior_model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, feat_extractor, prior_model, - device=device, metrics=metrics, - ddp=world_size>1, train_mode=train_mode, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + prior_model, + device=device, + metrics=metrics, + ddp=world_size > 1, + train_mode=train_mode, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -198,8 +224,8 @@ def train_xvec(gpu_id, args): # reg_loss = nn.L1Loss() # else: # reg_loss = nn.MSELoss() - -# trainer = Trainer(model, feat_extractor, prior_model, optimizer, + +# trainer = Trainer(model, feat_extractor, prior_model, optimizer, # reg_layers_enc=reg_layers_enc, reg_layers_classif=reg_layers_classif, # reg_weight_enc=reg_weight_enc, reg_weight_classif=reg_weight_classif, # reg_loss=reg_loss, @@ -211,36 +237,40 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser( - description=('Fine-tune x-vector model with deep feature loss ' - 'regularization from audio files')) + description=( + "Fine-tune x-vector model with deep feature loss " + "regularization from audio files" + ) + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--audio-path', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--val-list', dest='val_list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + 
parser.add_argument("--val-list", dest="val_list", required=True) AD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - parser.add_argument('--train-aug-cfg', default=None) - parser.add_argument('--val-aug-cfg', default=None) + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") # AFF.add_argparse_args(parser, prefix='feats') # MVN.add_argparse_args(parser, prefix='mvn') - - # parser.add_argument('--reg-layers-enc', type=int, default=None, nargs='+', + + # parser.add_argument('--reg-layers-enc', type=int, default=None, nargs='+', # help='list of layers from the encoder nnet to use for regularization ') - # parser.add_argument('--reg-layers-classif', type=int, default=None, nargs='+', + # parser.add_argument('--reg-layers-classif', type=int, default=None, nargs='+', # help='list of layers from the classif nnet to use for regularization ') - # parser.add_argument('--reg-weight-enc', type=float, default=0.1, + # parser.add_argument('--reg-weight-enc', type=float, default=0.1, # help='weight for regularization from enc layers') # parser.add_argument('--reg-weight-classif', type=float, default=0.1, # help='weight for regularization from classif layers') @@ -248,8 +278,8 @@ def train_xvec(gpu_id, args): # choices=['l1', 'mse'], # help=('type of regularization loss')) - parser.add_argument('--in-model-path', required=True) - parser.add_argument('--prior-model-path') + parser.add_argument("--in-model-path", required=True) + parser.add_argument("--prior-model-path") XVec.add_finetune_args(parser) Trainer.add_class_args(parser) @@ -257,16 +287,28 @@ def train_xvec(gpu_id, args): # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed (default: 1)') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('--train-mode', default='ft-embed-affine', - choices=['ft-full', 'ft-embed-affine'], - help=('ft-full: adapt full x-vector network' - 'ft-embed-affine: adapt affine transform before embedding')) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed (default: 1)" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--train-mode", + default="ft-embed-affine", + choices=["ft-full", "ft-embed-affine"], + help=( + "ft-full: adapt full x-vector network" + "ft-embed-affine: adapt affine transform before embedding" + ), + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -274,13 +316,13 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + 
parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) # config_logger(args.verbose) @@ -291,4 +333,3 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) - diff --git a/hyperion/bin/torch-finetune-xvec-dfr.py b/hyperion/bin/torch-finetune-xvec-dfr.py index e73dcd09..a26c14fb 100755 --- a/hyperion/bin/torch-finetune-xvec-dfr.py +++ b/hyperion/bin/torch-finetune-xvec-dfr.py @@ -7,10 +7,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -27,41 +32,45 @@ from hyperion.torch import TorchModelLoader as TML -def init_data(data_rspec, train_list, val_list, - num_workers, num_gpus, rank, **kwargs): +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader -def init_xvector(num_classes, in_model_path, prior_model_path, - rank, train_mode, **kwargs): +def init_xvector( + num_classes, in_model_path, prior_model_path, rank, train_mode, **kwargs +): - xvec_args = XVec.filter_finetune_args(**kwargs) + xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: - logging.info('xvector network ft args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = TML.load(in_model_path) model.rebuild_output_layer(**xvec_args) if prior_model_path: @@ -70,10 +79,10 @@ def init_xvector(num_classes, in_model_path, prior_model_path, prior_model = model.copy() prior_model.freeze() prior_model.eval() - if train_mode == 'ft-embed-affine': + if train_mode == "ft-embed-affine": model.freeze_preembed_layers() if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model, prior_model @@ -84,26 +93,30 @@ def train_xvec(gpu_id, args): kwargs = namespace_to_dict(args) 
torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") - train_mode = kwargs['train_mode'] + train_mode = kwargs["train_mode"] ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) - model, prior_model = init_xvector( - train_loader.dataset.num_classes, **kwargs) + model, prior_model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - - trainer = Trainer(model, prior_model, - device=device, metrics=metrics, - ddp=world_size>1, - train_mode=train_mode, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + + trainer = Trainer( + model, + prior_model, + device=device, + metrics=metrics, + ddp=world_size > 1, + train_mode=train_mode, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -115,7 +128,7 @@ def train_xvec(gpu_id, args): # prior_model_path, # reg_layers_enc, reg_layers_classif, # reg_weight_enc, reg_weight_classif, reg_loss, -# num_gpus, resume, num_workers, +# num_gpus, resume, num_workers, # train_mode, **kwargs): # set_float_cpu('float32') @@ -172,42 +185,43 @@ def train_xvec(gpu_id, args): # reg_loss = nn.L1Loss() # else: # reg_loss = nn.MSELoss() - -# trainer = Trainer(model, prior_model, optimizer, -# reg_layers_enc=reg_layers_enc, + +# trainer = Trainer(model, prior_model, optimizer, +# reg_layers_enc=reg_layers_enc, # reg_layers_classif=reg_layers_classif, -# reg_weight_enc=reg_weight_enc, +# reg_weight_enc=reg_weight_enc, # reg_weight_classif=reg_weight_classif, # reg_loss=reg_loss, # device=device, metrics=metrics, lr_scheduler=lr_sch, -# data_parallel=(num_gpus>1), train_mode=train_mode, +# data_parallel=(num_gpus>1), train_mode=train_mode, # **trn_args) # if resume: # trainer.load_last_checkpoint() # trainer.fit(train_loader, test_loader) - -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser( - description='Fine-tune x-vector model with deep feature loss regularization') + description="Fine-tune x-vector model with deep feature loss regularization" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--data-rspec', dest='data_rspec', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--val-list', dest='val_list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--data-rspec", dest="data_rspec", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - # parser.add_argument('--reg-layers-enc', type=int, default=None, nargs='+', + # parser.add_argument('--reg-layers-enc', type=int, default=None, nargs='+', # help='list of layers from the encoder nnet to use for regularization ') - # parser.add_argument('--reg-layers-classif', type=int, default=None, nargs='+', + # parser.add_argument('--reg-layers-classif', type=int, default=None, 
nargs='+', # help='list of layers from the classif nnet to use for regularization ') - # parser.add_argument('--reg-weight-enc', type=float, default=0.1, + # parser.add_argument('--reg-weight-enc', type=float, default=0.1, # help='weight for regularization from enc layers') # parser.add_argument('--reg-weight-classif', type=float, default=0.1, # help='weight for regularization from classif layers') @@ -215,26 +229,37 @@ def train_xvec(gpu_id, args): # choices=['l1', 'mse'], # help=('type of regularization loss')) - parser.add_argument('--in-model-path', required=True) - parser.add_argument('--prior-model-path') + parser.add_argument("--in-model-path", required=True) + parser.add_argument("--prior-model-path") XVec.add_finetune_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed (default: 1)') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('--train-mode', default='ft-embed-affine', - choices=['ft-full', 'ft-embed-affine'], - help=('ft-full: adapt full x-vector network' - 'ft-embed-affine: adapt affine transform before embedding')) - - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed (default: 1)" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--train-mode", + default="ft-embed-affine", + choices=["ft-full", "ft-embed-affine"], + help=( + "ft-full: adapt full x-vector network" + "ft-embed-affine: adapt affine transform before embedding" + ), + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -242,13 +267,13 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) # config_logger(args.verbose) @@ -259,4 +284,3 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) - diff --git a/hyperion/bin/torch-finetune-xvec-from-wav.py b/hyperion/bin/torch-finetune-xvec-from-wav.py index 6a39dde1..e33d9b8e 100755 --- a/hyperion/bin/torch-finetune-xvec-from-wav.py +++ b/hyperion/bin/torch-finetune-xvec-from-wav.py @@ -6,10 +6,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -26,59 +31,72 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF from hyperion.torch import TorchModelLoader as TML -def init_data(audio_path, train_list, val_list, 
- train_aug_cfg, val_aug_cfg, num_workers, - num_gpus, rank, **kwargs): + +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs +): ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(ad_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) + feat_args = AF.filter_args(**kwargs["feats"]) if rank == 0: - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=True, **feat_args) if rank == 0: - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) return feat_extractor def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): - xvec_args = XVec.filter_finetune_args(**kwargs) + xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: - logging.info('xvector network ft args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = TML.load(in_model_path) model.rebuild_output_layer(**xvec_args) - if train_mode == 'ft-embed-affine': + if train_mode == "ft-embed-affine": model.freeze_preembed_layers() if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model @@ -90,13 +108,13 @@ def train_xvec(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') - - train_mode = kwargs['train_mode'] + set_float_cpu("float32") + + train_mode = kwargs["train_mode"] ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) feat_extractor = init_feats(**kwargs) @@ -104,11 +122,17 @@ def train_xvec(gpu_id, args): trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, feat_extractor, - device=device, metrics=metrics, - ddp=world_size>1, train_mode=train_mode, 
**trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + train_mode=train_mode, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -116,7 +140,6 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() - # (audio_path, train_list, val_list, # train_aug_cfg, val_aug_cfg, # in_model_path, num_gpus, resume, num_workers, @@ -180,8 +203,8 @@ def train_xvec(gpu_id, args): # optimizer = OF.create(model.parameters(), **opt_args) # lr_sch = LRSF.create(optimizer, **lrsch_args) # metrics = { 'acc': CategoricalAccuracy() } - -# trainer = Trainer(model, feat_extractor, optimizer, + +# trainer = Trainer(model, feat_extractor, optimizer, # device=device, metrics=metrics, lr_scheduler=lr_sch, # data_parallel=(num_gpus>1), train_mode=train_mode, # **trn_args) @@ -190,26 +213,26 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) -if __name__ == '__main__': +if __name__ == "__main__": - parser = ArgumentParser( - description='Fine-tune x-vector model from audio files') + parser = ArgumentParser(description="Fine-tune x-vector model from audio files") - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--audio-path', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) AD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--train-aug-cfg', default=None) - parser.add_argument('--val-aug-cfg', default=None) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - AF.add_class_args(parser, prefix='feats') - parser.add_argument('--in-model-path', required=True) + AF.add_class_args(parser, prefix="feats") + parser.add_argument("--in-model-path", required=True) XVec.add_finetune_args(parser) Trainer.add_class_args(parser) @@ -217,16 +240,26 @@ def train_xvec(gpu_id, args): # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('--train-mode', default='ft-embed-affine', - choices=['ft-full', 'ft-embed-affine'], - help=('ft-full: adapt full x-vector network' - 'ft-embed-affine: adapt affine transform before embedding')) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--train-mode", + default="ft-embed-affine", + choices=["ft-full", "ft-embed-affine"], + help=( + "ft-full: adapt full x-vector network" + "ft-embed-affine: 
adapt affine transform before embedding" + ), + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -234,13 +267,13 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) # args = parser.parse_args() @@ -252,4 +285,3 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) - diff --git a/hyperion/bin/torch-finetune-xvec.py b/hyperion/bin/torch-finetune-xvec.py index 55934eaf..ec6386c8 100755 --- a/hyperion/bin/torch-finetune-xvec.py +++ b/hyperion/bin/torch-finetune-xvec.py @@ -6,10 +6,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -25,45 +30,48 @@ from hyperion.torch import TorchModelLoader as TML -def init_data(data_rspec, train_list, val_list, - num_workers, num_gpus, rank, **kwargs): +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_xvector(num_classes, in_model_path, rank, train_mode, **kwargs): - xvec_args = XVec.filter_finetune_args(**kwargs) + xvec_args = XVec.filter_finetune_args(**kwargs) if rank == 0: - logging.info('xvector network ft args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network ft args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = TML.load(in_model_path) model.rebuild_output_layer(**xvec_args) - if train_mode == 'ft-embed-affine': + if train_mode == "ft-embed-affine": model.freeze_preembed_layers() if rank == 0: - 
logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model @@ -74,24 +82,28 @@ def train_xvec(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") - train_mode = kwargs['train_mode'] + train_mode = kwargs["train_mode"] ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, - device=device, metrics=metrics, - ddp=world_size>1, - train_mode=train_mode, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + device=device, + metrics=metrics, + ddp=world_size > 1, + train_mode=train_mode, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -99,7 +111,6 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() - # (data_rspec, train_list, val_list, in_model_path, # num_gpus, resume, num_workers, train_mode, **kwargs): @@ -146,49 +157,60 @@ def train_xvec(gpu_id, args): # optimizer = OF.create(model.parameters(), **opt_args) # lr_sch = LRSF.create(optimizer, **lrsch_args) # metrics = { 'acc': CategoricalAccuracy() } - + # trainer = Trainer(model, optimizer, # device=device, metrics=metrics, lr_scheduler=lr_sch, -# data_parallel=(num_gpus>1), train_mode=train_mode, +# data_parallel=(num_gpus>1), train_mode=train_mode, # **trn_args) # if resume: # trainer.load_last_checkpoint() # trainer.fit(train_loader, test_loader) +if __name__ == "__main__": -if __name__ == '__main__': + parser = ArgumentParser(description="Fine-tune x-vector model") - parser = ArgumentParser( - description='Fine-tune x-vector model') - - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-model-path', required=True) + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument("--in-model-path", required=True) XVec.add_finetune_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed (default: 1)') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('--train-mode', default='ft-embed-affine', - choices=['ft-full', 'ft-embed-affine'], - help=('ft-full: adapt full x-vector network' - 'ft-embed-affine: adapt affine transform before embedding')) - - parser.add_argument('-v', '--verbose', dest='verbose', default=1, 
choices=[0, 1, 2, 3], type=int) - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed (default: 1)" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--train-mode", + default="ft-embed-affine", + choices=["ft-full", "ft-embed-affine"], + help=( + "ft-full: adapt full x-vector network" + "ft-embed-affine: adapt affine transform before embedding" + ), + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -196,13 +218,13 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) # args = parser.parse_args() @@ -214,4 +236,3 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) - diff --git a/hyperion/bin/torch-generate-adv-attacks-xvector-classif.py b/hyperion/bin/torch-generate-adv-attacks-xvector-classif.py index 728bded0..274bdf32 100755 --- a/hyperion/bin/torch-generate-adv-attacks-xvector-classif.py +++ b/hyperion/bin/torch-generate-adv-attacks-xvector-classif.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging from pathlib import Path @@ -30,28 +35,27 @@ from hyperion.torch.adv_attacks import RandomAttackFactory + def read_utt_list(list_file, class2int_file, part_idx, num_parts): - logging.info('reading utt list %s' % (list_file)) + logging.info("reading utt list %s" % (list_file)) utt_list = Utt2Info.load(list_file) utt_list = utt_list.split(part_idx, num_parts) - logging.info('reading class2int-file %s' % (class2int_file)) - class_info = pd.read_csv(class2int_file, header=None, sep=' ') - class2idx = {str(k):i for i,k in enumerate(class_info[0])} + logging.info("reading class2int-file %s" % (class2int_file)) + class_info = pd.read_csv(class2int_file, header=None, sep=" ") + class2idx = {str(k): i for i, k in enumerate(class_info[0])} class_idx = np.array([class2idx[k] for k in utt_list.info], dtype=int) keys = utt_list.key class_names = utt_list.info - return keys, class_names , class_idx + return keys, class_names, class_idx class MyModel(nn.Module): - def __init__(self, feat_extractor, xvector_model): super().__init__() self.feat_extractor = feat_extractor self.xvector_model = xvector_model self.vad = None - def forward(self, s): f = self.feat_extractor(s) if self.vad is not None: @@ -60,29 +64,29 @@ def forward(self, s): if n_vad_frames > n_feat_frames: self.vad = self.vad[:n_feat_frames] elif n_vad_frames < n_feat_frames: - f = f[:,:n_vad_frames] + f = f[:, :n_vad_frames] - f = f[:,self.vad] + f = f[:, self.vad] - f = f.transpose(1,2).contiguous() + f = f.transpose(1, 2).contiguous() score = self.xvector_model(f) return score def init_device(use_gpu): - set_float_cpu('float32') + 
set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device def init_model(model_path, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) # feat_args = AFF.filter_args(prefix='feats', **kwargs) # logging.info('initializing feature extractor args={}'.format(feat_args)) @@ -94,10 +98,10 @@ def init_model(model_path, **kwargs): # logging.info('initializing short-time mvn args={}'.format(mvn_args)) # mvn = MVN(**mvn_args) - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) xvector_model = TML.load(model_path) xvector_model.freeze() - logging.info('xvector-model={}'.format(xvector_model)) + logging.info("xvector-model={}".format(xvector_model)) model = MyModel(feat_extractor, xvector_model) model.eval() @@ -105,58 +109,77 @@ def init_model(model_path, **kwargs): def init_attack_factory(wav_scale=1, **kwargs): - attacks_args = RandomAttackFactory.filter_args(**kwargs['attacks']) - extra_args = {'eps_scale': wav_scale, - 'range_min': -wav_scale, - 'range_max': wav_scale, - 'loss': nn.functional.cross_entropy, - 'time_dim': 1} + attacks_args = RandomAttackFactory.filter_args(**kwargs["attacks"]) + extra_args = { + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.cross_entropy, + "time_dim": 1, + } attacks_args.update(extra_args) - logging.info('attacks args={}'.format(attacks_args)) + logging.info("attacks args={}".format(attacks_args)) attack_factory = RandomAttackFactory(**attacks_args) return attack_factory def select_random_chunk(key, s, fs, min_utt_length, max_utt_length): utt_length = torch.randint( - low=min_utt_length * fs, high=max_utt_length * fs+1, size=(1,)).item() + low=min_utt_length * fs, high=max_utt_length * fs + 1, size=(1,) + ).item() if utt_length < len(s): - first_sample = torch.randint( - low=0, high=len(s)-utt_length, size=(1,)).item() - s = s[first_sample:first_sample+utt_length] + first_sample = torch.randint(low=0, high=len(s) - utt_length, size=(1,)).item() + s = s[first_sample : first_sample + utt_length] logging.info( - 'extract-random-utt %s of length=%d first-sample=%d' % ( - key, len(s), first_sample)) + "extract-random-utt %s of length=%d first-sample=%d" + % (key, len(s), first_sample) + ) return s - -def generate_attacks(wav_file, list_file, vad_spec, vad_path_prefix, - class2int_file, model_path, - output_wav_dir, attack_info_file, attack_tag, - random_utt_length, min_utt_length, max_utt_length, - random_seed, p_attack, save_failed, save_benign, - use_gpu, part_idx, num_parts, **kwargs): +def generate_attacks( + wav_file, + list_file, + vad_spec, + vad_path_prefix, + class2int_file, + model_path, + output_wav_dir, + attack_info_file, + attack_tag, + random_utt_length, + min_utt_length, + max_utt_length, + random_seed, + p_attack, + save_failed, + save_benign, + use_gpu, + part_idx, + num_parts, + **kwargs +): device = init_device(use_gpu) model = 
init_model(model_path, **kwargs) model.to(device) - logging.info('opening audio read stream: %s' % (wav_file)) + logging.info("opening audio read stream: %s" % (wav_file)) audio_args = AR.filter_args(**kwargs) audio_reader = AR(wav_file) wav_scale = audio_reader.wav_scale - logging.info('opening audio write stream: %s' % (output_wav_dir)) - audio_writer = AW(output_wav_dir, audio_format='flac') + logging.info("opening audio write stream: %s" % (output_wav_dir)) + audio_writer = AW(output_wav_dir, audio_format="flac") if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=' ') + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") - keys, class_names, class_ids = read_utt_list(list_file, class2int_file, part_idx, num_parts) + keys, class_names, class_ids = read_utt_list( + list_file, class2int_file, part_idx, num_parts + ) attack_factory = init_attack_factory(**kwargs) attacks_info = {} @@ -166,36 +189,44 @@ def generate_attacks(wav_file, list_file, vad_spec, vad_path_prefix, class_id = class_ids[i] t1 = time.time() - logging.info('reading utt %s' % (key)) + logging.info("reading utt %s" % (key)) s, fs = audio_reader.read([key]) s = s[0] fs = fs[0] - torch.manual_seed(random_seed+int(s[0])) #this is to make results reproducible + torch.manual_seed( + random_seed + int(s[0]) + ) # this is to make results reproducible p = torch.rand(1).item() if p > p_attack: - logging.info('skipping attack for utt %s' % (key)) + logging.info("skipping attack for utt %s" % (key)) continue if random_utt_length: - s = select_random_chunk( - key, s, fs, min_utt_length, max_utt_length) + s = select_random_chunk(key, s, fs, min_utt_length, max_utt_length) if save_benign: s_benign = s - s = torch.as_tensor(s[None,:], dtype=torch.get_default_dtype()).to(device) + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) target = torch.as_tensor([class_id], dtype=torch.long).to(device) if vad_spec is not None: vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), - dtype=torch.bool).to(device) + vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + device + ) model.vad = vad - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % ( - key.seg_set[j], speech_frames, tot_frames, - speech_frames/tot_frames*100)) + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) t2 = time.time() with torch.no_grad(): @@ -203,8 +234,7 @@ def generate_attacks(wav_file, list_file, vad_spec, vad_path_prefix, _, pred = torch.max(score_benign, dim=1) if pred[0] != class_id: - logging.info( - 'utt %s failed benign classification, skipping...' % (key)) + logging.info("utt %s failed benign classification, skipping..." 
% (key)) continue t3 = time.time() @@ -222,9 +252,8 @@ def generate_attacks(wav_file, list_file, vad_spec, vad_path_prefix, success = True if success or save_failed: - key_attack = '%s-%s' % (key, attack_tag) - logging.info( - 'utt %s attack successful' % (key)) + key_attack = "%s-%s" % (key, attack_tag) + logging.info("utt %s attack successful" % (key)) stats_ij = compute_stats_adv_attack(s, s_adv) stats_ij = [float(stat.detach().cpu().numpy()[0]) for stat in stats_ij] @@ -232,45 +261,60 @@ def generate_attacks(wav_file, list_file, vad_spec, vad_path_prefix, s_adv = s_adv.cpu().numpy()[0] wav_attack = audio_writer.write(key_attack, s_adv, fs)[0] if save_benign: - key_benign = '%s-benign' % (key_attack) + key_benign = "%s-benign" % (key_attack) wav_benign = audio_writer.write(key_benign, s_benign, fs)[0] else: key_benign = key - wav_benign = '' - - attack_info.update({ - 'attack_tag': attack_tag, - 'wav_path': wav_attack, - 'class_name': class_names[i], - 'class_id': int(class_id), - 'key_benign': key_benign, - 'wav_benign': wav_benign, - 'snr': stats_ij[0], - 'px': stats_ij[1], 'pn': stats_ij[2], - 'x_l2': stats_ij[3], 'x_linf': stats_ij[4], - 'n_l0': stats_ij[5], - 'n_l2': stats_ij[6], 'n_linf': stats_ij[7], - 'num_samples': s.shape[-1], - 'success': success}) + wav_benign = "" + + attack_info.update( + { + "attack_tag": attack_tag, + "wav_path": wav_attack, + "class_name": class_names[i], + "class_id": int(class_id), + "key_benign": key_benign, + "wav_benign": wav_benign, + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + "success": success, + } + ) attacks_info[key_attack] = attack_info - + else: - logging.info( - 'utt %s attack failed, skipping...' % (key)) + logging.info("utt %s attack failed, skipping..." 
% (key)) t6 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f ' - 'eval-benign-time=%.3f attack-time=%.3f eval-attack-time=%3f ' - 'rt-factor=%.4f') % ( - key, t6-t1, t2-t1, t3-t2, t4-t3, t5-t4, - s.shape[1]/fs/(t6-t1))) - - logging.info('saving attack info to %s' % (attack_info_file)) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f " + "eval-benign-time=%.3f attack-time=%.3f eval-attack-time=%3f " + "rt-factor=%.4f" + ) + % ( + key, + t6 - t1, + t2 - t1, + t3 - t2, + t4 - t3, + t5 - t4, + s.shape[1] / fs / (t6 - t1), + ) + ) + + logging.info("saving attack info to %s" % (attack_info_file)) Path(attack_info_file).parent.mkdir(parents=True, exist_ok=True) - - with open(attack_info_file, 'w') as f: - #only save if we have successful attacks + + with open(attack_info_file, "w") as f: + # only save if we have successful attacks if attacks_info: yaml.dump(attacks_info, f, sort_keys=True) @@ -278,58 +322,99 @@ def generate_attacks(wav_file, list_file, vad_spec, vad_path_prefix, if __name__ == "__main__": parser = ArgumentParser( - description='Generate Attacks for speaker classification with x-vectors') + description="Generate Attacks for speaker classification with x-vectors" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--wav-file', required=True) - parser.add_argument('--list-file', required=True) - parser.add_argument('--class2int-file', required=True) - parser.add_argument('--attack-tag', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--wav-file", required=True) + parser.add_argument("--list-file", required=True) + parser.add_argument("--class2int-file", required=True) + parser.add_argument("--attack-tag", required=True) AR.add_class_args(parser) - AF.add_class_args(parser, prefix='feats') - - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--vad-path-prefix', dest='vad_path_prefix', default=None, - help=('scp file_path prefix for vad')) - - parser.add_argument('--model-path', required=True) - parser.add_argument('--use-gpu', default=False, action='store_true', - help='extract xvectors in gpu') - - RandomAttackFactory.add_class_args(parser, prefix='attacks') - - parser.add_argument('--part-idx', default=1, type=int, - help=('part index')) - parser.add_argument('--num-parts', default=1, type=int, - help=('number of parts in which we divide the list ' - 'to run evaluation in parallel')) - - parser.add_argument('--output-wav-dir', default=None, - help='output path of adv signals') - parser.add_argument('--attack-info-file', default=None, - help='output path of to save information about the generated attacks') - parser.add_argument('--random-seed', default=1234, type=int, - help='random seed for pytorch') - - parser.add_argument('--random-utt-length', default=False, action='store_true', - help='calculates x-vector from a random chunk') - parser.add_argument('--min-utt-length', type=int, default=5, - help=('minimum utterance length (in secs) when using random utt length')) - parser.add_argument('--max-utt-length', type=int, default=120, - help=('maximum utterance length (in secs) when using random utt length')) - - parser.add_argument('--p-attack', type=float, default=1, - help=('probability of generating an attack for a given utterance')) - parser.add_argument('--save-failed', default=False, action='store_true', - help=('save failed attacks also')) - parser.add_argument('--save-benign', default=False, action='store_true', - help=('save a copy 
of the benign sample')) - - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + RandomAttackFactory.add_class_args(parser, prefix="attacks") + + parser.add_argument("--part-idx", default=1, type=int, help=("part index")) + parser.add_argument( + "--num-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument( + "--output-wav-dir", default=None, help="output path of adv signals" + ) + parser.add_argument( + "--attack-info-file", + default=None, + help="output path of to save information about the generated attacks", + ) + parser.add_argument( + "--random-seed", default=1234, type=int, help="random seed for pytorch" + ) + + parser.add_argument( + "--random-utt-length", + default=False, + action="store_true", + help="calculates x-vector from a random chunk", + ) + parser.add_argument( + "--min-utt-length", + type=int, + default=5, + help=("minimum utterance length (in secs) when using random utt length"), + ) + parser.add_argument( + "--max-utt-length", + type=int, + default=120, + help=("maximum utterance length (in secs) when using random utt length"), + ) + + parser.add_argument( + "--p-attack", + type=float, + default=1, + help=("probability of generating an attack for a given utterance"), + ) + parser.add_argument( + "--save-failed", + default=False, + action="store_true", + help=("save failed attacks also"), + ) + parser.add_argument( + "--save-benign", + default=False, + action="store_true", + help=("save a copy of the benign sample"), + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py b/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py index dfe926ec..58f73b00 100755 --- a/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py +++ b/hyperion/bin/torch-generate-adv-attacks-xvector-verif.py @@ -5,7 +5,12 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging from pathlib import Path @@ -34,10 +39,11 @@ from hyperion.torch.adv_attacks import RandomAttackFactory -class MyModel(nn.Module): - def __init__(self, feat_extractor, xvector_model, embed_layer=None, - calibrator=None, sigma=0): +class MyModel(nn.Module): + def __init__( + self, feat_extractor, xvector_model, embed_layer=None, calibrator=None, sigma=0 + ): super().__init__() self.feat_extractor = feat_extractor self.xvector_model = xvector_model @@ -47,11 +53,10 @@ def __init__(self, feat_extractor, xvector_model, embed_layer=None, self.calibrator = calibrator self.sigma = sigma - def forward(self, s_t): # print('sigma0=', self.sigma) if self.sigma > 0: - s_t = s_t + self.sigma*torch.randn_like(s_t) + s_t = s_t + self.sigma * 
torch.randn_like(s_t) # print('sigma1=', self.sigma) f_t = self.feat_extractor(s_t) if self.vad_t is not None: @@ -60,11 +65,11 @@ def forward(self, s_t): if n_vad_frames > n_feat_frames: self.vad_t = self.vad_t[:n_feat_frames] elif n_vad_frames < n_feat_frames: - f_t = f_t[:,:n_vad_frames] + f_t = f_t[:, :n_vad_frames] - f_t = f_t[:,self.vad_t] + f_t = f_t[:, self.vad_t] - f_t = f_t.transpose(1,2).contiguous() + f_t = f_t.transpose(1, 2).contiguous() x_t = self.xvector_model.extract_embed(f_t, embed_layer=self.embed_layer) x_t = l2_norm(x_t) x_e = l2_norm(self.x_e) @@ -85,7 +90,7 @@ def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): x_e = r.read(enroll.key, squeeze=True) f, idx = ismember(key.model_set, enroll.info) - + assert np.all(f) x_e = x_e[idx] @@ -93,16 +98,16 @@ def read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts): def init_model(model_path, embed_layer, cal_file, threshold, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + feat_args = AF.filter_args(**kwargs["feats"]) + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=False, **feat_args) - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) - logging.info('loading model {}'.format(model_path)) + logging.info("loading model {}".format(model_path)) xvector_model = TML.load(model_path) xvector_model.freeze() - logging.info('xvector-model={}'.format(xvector_model)) + logging.info("xvector-model={}".format(xvector_model)) # feat_args = AFF.filter_args(prefix='feats', **kwargs) # logging.info('initializing feature extractor args={}'.format(feat_args)) @@ -120,11 +125,11 @@ def init_model(model_path, embed_layer, cal_file, threshold, **kwargs): calibrator = None if cal_file is not None: - logging.info('loading calibration params {}'.format(cal_file)) + logging.info("loading calibration params {}".format(cal_file)) lr = LR.load(cal_file) - #subting the threshold here will put the decision threshold in 0 - #some attacks use thr=0 to decide if the attack is succesful - calibrator = Calibrator(lr.A[0,0], lr.b[0]-threshold) + # subting the threshold here will put the decision threshold in 0 + # some attacks use thr=0 to decide if the attack is succesful + calibrator = Calibrator(lr.A[0, 0], lr.b[0] - threshold) model = MyModel(feat_extractor, xvector_model, embed_layer, calibrator) model.eval() @@ -132,23 +137,25 @@ def init_model(model_path, embed_layer, cal_file, threshold, **kwargs): def init_attack_factory(wav_scale=1, **kwargs): - attacks_args = RandomAttackFactory.filter_args(**kwargs['attacks']) - extra_args = {'eps_scale': wav_scale, - 'range_min': -wav_scale, - 'range_max': wav_scale, - 'loss': nn.functional.binary_cross_entropy_with_logits, - 'time_dim': 1} + attacks_args = RandomAttackFactory.filter_args(**kwargs["attacks"]) + extra_args = { + "eps_scale": wav_scale, + "range_min": -wav_scale, + "range_max": wav_scale, + "loss": nn.functional.binary_cross_entropy_with_logits, + "time_dim": 1, + } attacks_args.update(extra_args) - logging.info('attacks args={}'.format(attacks_args)) + logging.info("attacks args={}".format(attacks_args)) attack_factory = RandomAttackFactory(**attacks_args) return attack_factory def init_device(use_gpu): - set_float_cpu('float32') + set_float_cpu("float32") num_gpus = 1 if use_gpu else 0 - 
logging.info('initializing devices num_gpus={}'.format(num_gpus)) + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) return device @@ -166,13 +173,28 @@ def skip_attack(is_target, p_tar_attack, p_non_attack): def generate_attacks( - v_file, key_file, enroll_file, test_wav_file, - vad_spec, vad_path_prefix, - model_path, embed_layer, cal_file, threshold, - output_wav_dir, attack_info_file, attack_tag, - p_tar_attack, p_non_attack, save_failed, - use_gpu, seg_part_idx, num_seg_parts, random_seed, - **kwargs): + v_file, + key_file, + enroll_file, + test_wav_file, + vad_spec, + vad_path_prefix, + model_path, + embed_layer, + cal_file, + threshold, + output_wav_dir, + attack_info_file, + attack_tag, + p_tar_attack, + p_non_attack, + save_failed, + use_gpu, + seg_part_idx, + num_seg_parts, + random_seed, + **kwargs +): device = init_device(use_gpu) model = init_model(model_path, embed_layer, cal_file, threshold, **kwargs) @@ -181,73 +203,85 @@ def generate_attacks( tar = torch.as_tensor([1], dtype=torch.float).to(device) non = torch.as_tensor([0], dtype=torch.float).to(device) - logging.info('loading key and enrollment x-vectors') + logging.info("loading key and enrollment x-vectors") key, x_e = read_data(v_file, key_file, enroll_file, seg_part_idx, num_seg_parts) x_e = torch.as_tensor(x_e, dtype=torch.get_default_dtype()) - logging.info('opening audio read stream: %s' % (test_wav_file)) + logging.info("opening audio read stream: %s" % (test_wav_file)) audio_args = AR.filter_args(**kwargs) audio_reader = AR(test_wav_file) wav_scale = audio_reader.wav_scale - logging.info('opening audio write stream: %s' % (output_wav_dir)) - audio_writer = AW(output_wav_dir, audio_format='flac') + logging.info("opening audio write stream: %s" % (output_wav_dir)) + audio_writer = AW(output_wav_dir, audio_format="flac") if vad_spec is not None: - logging.info('opening VAD stream: %s' % (vad_spec)) - v_reader = VRF.create( - vad_spec, path_prefix=vad_path_prefix, scp_sep=' ') + logging.info("opening VAD stream: %s" % (vad_spec)) + v_reader = VRF.create(vad_spec, path_prefix=vad_path_prefix, scp_sep=" ") attack_factory = init_attack_factory(**kwargs) attacks_info = {} for j in range(key.num_tests): t1 = time.time() - logging.info('scoring test utt %s' % (key.seg_set[j])) + logging.info("scoring test utt %s" % (key.seg_set[j])) s, fs = audio_reader.read([key.seg_set[j]]) s = s[0] fs = fs[0] - torch.manual_seed(random_seed+int(s[0])) #this is to make results reproducible - s = torch.as_tensor(s[None,:], dtype=torch.get_default_dtype()).to(device) - + torch.manual_seed( + random_seed + int(s[0]) + ) # this is to make results reproducible + s = torch.as_tensor(s[None, :], dtype=torch.get_default_dtype()).to(device) + if vad_spec is not None: vad = v_reader.read([key.seg_set[j]])[0] tot_frames = len(vad) speech_frames = np.sum(vad) - vad = torch.as_tensor(vad.astype(np.bool, copy=False), - dtype=torch.bool).to(device) + vad = torch.as_tensor(vad.astype(np.bool, copy=False), dtype=torch.bool).to( + device + ) model.vad_t = vad - logging.info('utt %s detected %d/%d (%.2f %%) speech frames' % ( - key.seg_set[j], speech_frames, tot_frames, - speech_frames/tot_frames*100)) + logging.info( + "utt %s detected %d/%d (%.2f %%) speech frames" + % ( + key.seg_set[j], + speech_frames, + tot_frames, + speech_frames / tot_frames * 100, + ) + ) t2 = time.time() trial_time = 0 num_trials = 0 for i in range(key.num_models): - trial_id='%s-%s' % (key.model_set[i], key.seg_set[j]) - 
if key.tar[i,j] or key.non[i,j]: + trial_id = "%s-%s" % (key.model_set[i], key.seg_set[j]) + if key.tar[i, j] or key.non[i, j]: t3 = time.time() - if skip_attack(key.tar[i,j], p_tar_attack, p_non_attack): - logging.info('skipping attack for tar trial %s' % (trial_id)) + if skip_attack(key.tar[i, j], p_tar_attack, p_non_attack): + logging.info("skipping attack for tar trial %s" % (trial_id)) continue model.x_e = x_e[i].to(device) with torch.no_grad(): - score_benign = model(s) + score_benign = model(s) - if key.tar[i,j] and score_benign < 0: + if key.tar[i, j] and score_benign < 0: logging.info( - 'target trial %s failed benign classification, skipping...' % (trial_id)) + "target trial %s failed benign classification, skipping..." + % (trial_id) + ) continue - elif key.non[i,j] and score_benign > 0: + elif key.non[i, j] and score_benign > 0: logging.info( - 'non-target trial %s failed benign classification, skipping...' % (trial_id)) + "non-target trial %s failed benign classification, skipping..." + % (trial_id) + ) continue attack = attack_factory.sample_attack(model) - if key.tar[i,j]: + if key.tar[i, j]: t = non if attack.targeted else tar else: t = tar if attack.targeted else non @@ -259,63 +293,80 @@ def generate_attacks( score_adv = model(s_adv) t4 = time.time() - trial_time += (t4 - t3) + trial_time += t4 - t3 num_trials += 1 - success=True - if key.tar[i,j] and score_adv > 0: - success=False + success = True + if key.tar[i, j] and score_adv > 0: + success = False if not save_failed: logging.info( - 'attack on target trial %s failed, skipping...' % (trial_id)) + "attack on target trial %s failed, skipping..." % (trial_id) + ) continue - elif key.non[i,j] and score_adv < 0: - success=False + elif key.non[i, j] and score_adv < 0: + success = False if not save_failed: logging.info( - 'attack on non-target trial %s failed benign classification, skipping...' % (trial_id)) + "attack on non-target trial %s failed benign classification, skipping..." 
+ % (trial_id) + ) continue if success: - logging.info( - 'attack on trial %s successful' % (trial_id)) - + logging.info("attack on trial %s successful" % (trial_id)) + stats_ij = compute_stats_adv_attack(s, s_adv) stats_ij = [float(stat.detach().cpu().numpy()[0]) for stat in stats_ij] - + s_adv = s_adv.cpu().numpy()[0] - key_attack='%s-%s' % (trial_id, attack_tag) + key_attack = "%s-%s" % (trial_id, attack_tag) output_wav = audio_writer.write(key_attack, s_adv, fs) - attack_info.update({ - 'attack_tag': attack_tag, - 'wav_path': output_wav[0], - 'class_name': 'target' if key.tar[i,j] else 'non-target', - 'class_id': int(key.tar[i,j]), - 'key_benign': trial_id, - 'enroll': str(key.model_set[i]), - 'test_benign': str(key.seg_set[j]), - 'snr': stats_ij[0], - 'px': stats_ij[1], 'pn': stats_ij[2], - 'x_l2': stats_ij[3], 'x_linf': stats_ij[4], - 'n_l0': stats_ij[5], - 'n_l2': stats_ij[6], 'n_linf': stats_ij[7], - 'num_samples': s.shape[-1], - 'success': success}) + attack_info.update( + { + "attack_tag": attack_tag, + "wav_path": output_wav[0], + "class_name": "target" if key.tar[i, j] else "non-target", + "class_id": int(key.tar[i, j]), + "key_benign": trial_id, + "enroll": str(key.model_set[i]), + "test_benign": str(key.seg_set[j]), + "snr": stats_ij[0], + "px": stats_ij[1], + "pn": stats_ij[2], + "x_l2": stats_ij[3], + "x_linf": stats_ij[4], + "n_l0": stats_ij[5], + "n_l2": stats_ij[6], + "n_linf": stats_ij[7], + "num_samples": s.shape[-1], + "success": success, + } + ) attacks_info[key_attack] = attack_info if num_trials > 0: trial_time /= num_trials t7 = time.time() - logging.info(( - 'utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d ' - 'rt-factor=%.4f') % ( - key.seg_set[j], t7-t1, t2-t1, trial_time, num_trials, - num_trials*len(s)/fs/(t7-t1))) - - logging.info('saving attack info to %s' % (attack_info_file)) + logging.info( + ( + "utt %s total-time=%.3f read-time=%.3f trial-time=%.3f n_trials=%d " + "rt-factor=%.4f" + ) + % ( + key.seg_set[j], + t7 - t1, + t2 - t1, + trial_time, + num_trials, + num_trials * len(s) / fs / (t7 - t1), + ) + ) + + logging.info("saving attack info to %s" % (attack_info_file)) Path(attack_info_file).parent.mkdir(parents=True, exist_ok=True) - - with open(attack_info_file, 'w') as f: - #only save if we have successful attacks + + with open(attack_info_file, "w") as f: + # only save if we have successful attacks if attacks_info: yaml.dump(attacks_info, f, sort_keys=True) @@ -323,59 +374,94 @@ def generate_attacks( if __name__ == "__main__": parser = ArgumentParser( - description='Generate Attacks for speaker verification with x-vectors+cos+calibration') + description="Generate Attacks for speaker verification with x-vectors+cos+calibration" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--v-file', required=True) - parser.add_argument('--key-file', default=None) - parser.add_argument('--enroll-file', required=True) - parser.add_argument('--test-wav-file', required=True) - parser.add_argument('--attack-tag', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--v-file", required=True) + parser.add_argument("--key-file", default=None) + parser.add_argument("--enroll-file", required=True) + parser.add_argument("--test-wav-file", required=True) + parser.add_argument("--attack-tag", required=True) AR.add_class_args(parser) - AF.add_class_args(parser, prefix='feats') - - parser.add_argument('--vad', dest='vad_spec', default=None) - parser.add_argument('--vad-path-prefix', 
dest='vad_path_prefix', default=None, - help=('scp file_path prefix for vad')) - - parser.add_argument('--model-path', required=True) - parser.add_argument('--embed-layer', type=int, default=None, - help=('classifier layer to get the embedding from,' - 'if None the layer set in training phase is used')) - - parser.add_argument('--use-gpu', default=False, action='store_true', - help='extract xvectors in gpu') - - parser.add_argument('--cal-file', default=None, help='score calibration file') - parser.add_argument('--threshold', default=0, type=float, help='decision threshold') - - RandomAttackFactory.add_class_args(parser, prefix='attacks') - - parser.add_argument('--seg-part-idx', default=1, type=int, - help=('test part index')) - parser.add_argument('--num-seg-parts', default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - - parser.add_argument('--output-wav-dir', default=None, - help='output path of adv signals') - parser.add_argument('--attack-info-file', default=None, - help='output path of to save information about the generated attacks') - parser.add_argument('--random-seed', default=1234, type=int, - help='random seed for pytorch') - - parser.add_argument('--p-tar-attack', type=float, default=1, - help=('probability of generating an attack for a target trial')) - parser.add_argument('--p-non-attack', type=float, default=1, - help=('probability of generating an attack for a non-target trial')) - parser.add_argument('--save-failed', default=False, action='store_true', - help=('save failed attacks also')) - - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + AF.add_class_args(parser, prefix="feats") + + parser.add_argument("--vad", dest="vad_spec", default=None) + parser.add_argument( + "--vad-path-prefix", + dest="vad_path_prefix", + default=None, + help=("scp file_path prefix for vad"), + ) + + parser.add_argument("--model-path", required=True) + parser.add_argument( + "--embed-layer", + type=int, + default=None, + help=( + "classifier layer to get the embedding from," + "if None the layer set in training phase is used" + ), + ) + + parser.add_argument( + "--use-gpu", default=False, action="store_true", help="extract xvectors in gpu" + ) + + parser.add_argument("--cal-file", default=None, help="score calibration file") + parser.add_argument("--threshold", default=0, type=float, help="decision threshold") + + RandomAttackFactory.add_class_args(parser, prefix="attacks") + + parser.add_argument("--seg-part-idx", default=1, type=int, help=("test part index")) + parser.add_argument( + "--num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + parser.add_argument( + "--output-wav-dir", default=None, help="output path of adv signals" + ) + parser.add_argument( + "--attack-info-file", + default=None, + help="output path of to save information about the generated attacks", + ) + parser.add_argument( + "--random-seed", default=1234, type=int, help="random seed for pytorch" + ) + + parser.add_argument( + "--p-tar-attack", + type=float, + default=1, + help=("probability of generating an attack for a target trial"), + ) + parser.add_argument( + "--p-non-attack", + type=float, + default=1, + help=("probability of generating an attack for a non-target trial"), + ) + parser.add_argument( + "--save-failed", + default=False, + action="store_true", + help=("save failed 
attacks also"), + ) + + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) diff --git a/hyperion/bin/torch-train-conformer-enc-v1-vq-dvae.py b/hyperion/bin/torch-train-conformer-enc-v1-vq-dvae.py index f1892af0..9adb2cfd 100755 --- a/hyperion/bin/torch-train-conformer-enc-v1-vq-dvae.py +++ b/hyperion/bin/torch-train-conformer-enc-v1-vq-dvae.py @@ -26,51 +26,64 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - in_feats, num_gpus, resume, num_workers, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def train_vae( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + in_feats, + num_gpus, + resume, + num_workers, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, train_pair_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, val_pair_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD( + data_rspec, train_list, train_pair_list, return_class=False, **sd_args + ) + val_data = SD( + data_rspec, val_list, val_pair_list, return_class=False, is_val=True, **sd_args + ) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, 
batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) - enc_args['out_time_dim'] = -1 - dec_args['out_time_dim'] = -1 + enc_args["out_time_dim"] = -1 + dec_args["out_time_dim"] = -1 encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) @@ -78,54 +91,69 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Denoising VQ-VAE with Conformer Enconder as Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train Denoising VQ-VAE with Conformer Enconder as Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec', in_feats=True) + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec", in_feats=True) VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed (default: 1)') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed (default: 1)" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from 
checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -136,4 +164,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-conformer-enc-v1-vq-vae.py b/hyperion/bin/torch-train-conformer-enc-v1-vq-vae.py index 468dc8ac..d227a8b2 100755 --- a/hyperion/bin/torch-train-conformer-enc-v1-vq-vae.py +++ b/hyperion/bin/torch-train-conformer-enc-v1-vq-vae.py @@ -26,49 +26,50 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - in_feats, num_gpus, resume, num_workers, **kwargs): +def train_vae( + data_rspec, train_list, val_list, in_feats, num_gpus, resume, num_workers, **kwargs +): - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD(data_rspec, train_list, return_class=False, **sd_args) + val_data = SD(data_rspec, val_list, return_class=False, is_val=True, **sd_args) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) - enc_args['out_time_dim'] = -1 - dec_args['out_time_dim'] = -1 + enc_args["out_time_dim"] = -1 + 
dec_args["out_time_dim"] = -1 encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) @@ -76,53 +77,65 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train VQ-VAE with Conformer Encoder as Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train VQ-VAE with Conformer Encoder as Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec', in_feats=True) + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec", in_feats=True) VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -133,4 +146,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-dc1d-ae.py b/hyperion/bin/torch-train-dc1d-ae.py index 3cf365c0..e7547927 100755 --- a/hyperion/bin/torch-train-dc1d-ae.py +++ 
b/hyperion/bin/torch-train-dc1d-ae.py @@ -26,42 +26,58 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_ae(data_rspec, train_list, val_list, exp_path, in_feats, latent_dim, loss, - epochs, num_gpus, log_interval, resume, num_workers, - grad_acc_steps, use_amp, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def train_ae( + data_rspec, + train_list, + val_list, + exp_path, + in_feats, + latent_dim, + loss, + epochs, + num_gpus, + log_interval, + resume, + num_workers, + grad_acc_steps, + use_amp, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - - logging.info('init datasets') + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(in_feats, head_channels=latent_dim, **enc_args) decoder = Decoder(latent_dim, head_channels=in_feats, **dec_args) @@ -70,71 +86,99 @@ def train_ae(data_rspec, train_list, val_list, exp_path, in_feats, latent_dim, l optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - losses = { 'mse': nn.MSELoss, 'l1': nn.L1Loss, 'smooth-l1': nn.SmoothL1Loss } - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } + losses = {"mse": nn.MSELoss, "l1": nn.L1Loss, "smooth-l1": nn.SmoothL1Loss} + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} loss = losses[loss]() - trainer = Trainer(model, optimizer, loss, epochs, exp_path, - grad_acc_steps=grad_acc_steps, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), use_amp=use_amp) + trainer = 
Trainer( + model, + optimizer, + loss, + epochs, + exp_path, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + use_amp=use_amp, + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train AE with Deep Conv1d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train AE with Deep Conv1d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', dest='data_rspec', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--val-list', dest='val_list', required=True) + parser.add_argument("--data-rspec", dest="data_rspec", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - - parser.add_argument('--grad-acc-steps', type=int, default=1, - help='gradient accumulation batches before weigth update') - - parser.add_argument('--epochs', type=int, default=200, - help='number of epochs') - - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') - parser.add_argument('--latent-dim', type=int, required=True, - help='latent representation dimension') - - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') - - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') - - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--log-interval', type=int, default=10, - help='how many batches to wait before logging training status') - - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - - parser.add_argument('--use-amp', action='store_true', default=False, - help='use mixed precision training') - - parser.add_argument('--exp-path', help='experiment path') - - parser.add_argument('--loss', default='mse', choices=['mse', 'l1', 'smooth-l1']) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + + parser.add_argument( + "--grad-acc-steps", + type=int, + default=1, + help="gradient accumulation batches before weigth update", + ) + + parser.add_argument("--epochs", type=int, default=200, help="number of epochs") + + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) + parser.add_argument( + "--latent-dim", type=int, required=True, help="latent representation dimension" + ) + + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") + + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") + + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + 
"--log-interval", + type=int, + default=10, + help="how many batches to wait before logging training status", + ) + + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + + parser.add_argument( + "--use-amp", + action="store_true", + default=False, + help="use mixed precision training", + ) + + parser.add_argument("--exp-path", help="experiment path") + + parser.add_argument("--loss", default="mse", choices=["mse", "l1", "smooth-l1"]) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -145,4 +189,3 @@ def train_ae(data_rspec, train_list, val_list, exp_path, in_feats, latent_dim, l del args.seed train_ae(**vars(args)) - diff --git a/hyperion/bin/torch-train-dc1d-dvae.py b/hyperion/bin/torch-train-dc1d-dvae.py index ab4a5fb7..343807c2 100755 --- a/hyperion/bin/torch-train-dc1d-dvae.py +++ b/hyperion/bin/torch-train-dc1d-dvae.py @@ -26,48 +26,61 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - in_feats, num_gpus, resume, num_workers, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def train_vae( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + in_feats, + num_gpus, + resume, + num_workers, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, train_pair_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, val_pair_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD( + data_rspec, train_list, train_pair_list, return_class=False, **sd_args + ) + val_data = SD( + data_rspec, val_list, val_pair_list, return_class=False, 
is_val=True, **sd_args + ) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) @@ -76,55 +89,67 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Denoising VAE with Deep Conv1d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train Denoising VAE with Deep Conv1d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + 
parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -135,4 +160,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-dc1d-vae.py b/hyperion/bin/torch-train-dc1d-vae.py index 20b527cb..daa67b3e 100755 --- a/hyperion/bin/torch-train-dc1d-vae.py +++ b/hyperion/bin/torch-train-dc1d-vae.py @@ -26,45 +26,48 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, in_feats, - num_gpus, resume, num_workers, **kwargs): +def train_vae( + data_rspec, train_list, val_list, in_feats, num_gpus, resume, num_workers, **kwargs +): - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs 
+ ) encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) @@ -73,53 +76,65 @@ def train_vae(data_rspec, train_list, val_list, in_feats, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train VAE with Deep Conv 1d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train VAE with Deep Conv 1d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', dest='data_rspec', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--val-list', dest='val_list', required=True) + parser.add_argument("--data-rspec", dest="data_rspec", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -130,4 +145,3 @@ def train_vae(data_rspec, train_list, val_list, in_feats, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-dc2d-dvae.py b/hyperion/bin/torch-train-dc2d-dvae.py index d0911f7b..2e32b9f9 100755 --- a/hyperion/bin/torch-train-dc2d-dvae.py +++ 
b/hyperion/bin/torch-train-dc2d-dvae.py @@ -26,48 +26,60 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - num_gpus, resume, num_workers, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def train_vae( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + num_gpus, + resume, + num_workers, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, train_pair_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, val_pair_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD( + data_rspec, train_list, train_pair_list, return_class=False, **sd_args + ) + val_data = SD( + data_rspec, val_list, val_pair_list, return_class=False, is_val=True, **sd_args + ) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) @@ -76,53 +88,64 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - 
data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Denoising VAE with Deep Convolutional 2d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train Denoising VAE with Deep Convolutional 2d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -133,4 +156,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-dc2d-vae.py b/hyperion/bin/torch-train-dc2d-vae.py index 59e04bf5..d8675ae9 100755 --- a/hyperion/bin/torch-train-dc2d-vae.py +++ b/hyperion/bin/torch-train-dc2d-vae.py @@ -26,47 +26,48 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - num_gpus, resume, num_workers, **kwargs): +def train_vae( + data_rspec, train_list, val_list, num_gpus, resume, num_workers, **kwargs +): - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + set_float_cpu("float32") + 
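All of the VAE trainers above receive metrics={"mse": nn.MSELoss(), "L1": nn.L1Loss()}. How the hyperion Trainer consumes that dict is internal to the library, but evaluating it on one batch amounts to the short, self-contained sketch below (the tensors are random stand-ins for a reconstruction and its target).

import torch
import torch.nn as nn

metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()}
x_hat = torch.randn(4, 80)  # stand-in for a reconstructed feature batch
x = torch.randn(4, 80)      # stand-in for the input features

# compute every registered metric for this batch
batch_metrics = {name: loss_fn(x_hat, x).item() for name, loss_fn in metrics.items()}
print(batch_metrics)        # e.g. {'mse': 1.98, 'L1': 1.12}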
logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD(data_rspec, train_list, return_class=False, **sd_args) + val_data = SD(data_rspec, val_list, return_class=False, is_val=True, **sd_args) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) @@ -75,51 +76,62 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains VAE with Deep Convolutional 2d Encoder-Decoders') + fromfile_prefix_chars="@", + 
description="Trains VAE with Deep Convolutional 2d Encoder-Decoders", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -130,4 +142,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-dvae.py b/hyperion/bin/torch-train-dvae.py index 6179b911..2f1fcbe9 100755 --- a/hyperion/bin/torch-train-dvae.py +++ b/hyperion/bin/torch-train-dvae.py @@ -6,10 +6,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -30,86 +35,97 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler enc_dict = { - 'dc1d': DC1dEncoder, - 'dc2d': DC2dEncoder, - 'resnet1d': ResNet1dEncoder, - 'resnet2d': ResNet2dEncoder, - 'transformer-enc-v1': TransformerEncoderV1, - 'conformer-enc-v1': ConformerEncoderV1 } + "dc1d": DC1dEncoder, + "dc2d": DC2dEncoder, + "resnet1d": ResNet1dEncoder, + "resnet2d": ResNet2dEncoder, + "transformer-enc-v1": TransformerEncoderV1, + "conformer-enc-v1": ConformerEncoderV1, +} dec_dict = { - 'dc1d': DC1dDecoder, - 'dc2d': DC2dDecoder, - 'resnet1d': ResNet1dDecoder, - 'resnet2d': ResNet2dDecoder, - 'transformer-enc-v1': TransformerEncoderV1, - 'conformer-enc-v1': ConformerEncoderV1 } - - -def init_data(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - num_workers, num_gpus, rank, **kwargs): + "dc1d": DC1dDecoder, + "dc2d": DC2dDecoder, + "resnet1d": ResNet1dDecoder, + "resnet2d": 
ResNet2dDecoder, + "transformer-enc-v1": TransformerEncoderV1, + "conformer-enc-v1": ConformerEncoderV1, +} + + +def init_data( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + num_workers, + num_gpus, + rank, + **kwargs +): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, train_pair_list, **sd_args) val_data = SD(data_rspec, val_list, val_pair_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_model(rank, **kwargs): - Encoder = kwargs['enc_class'] - Decoder = kwargs['dec_class'] - enc_args = Encoder.filter_args(**kwargs['enc']) - dec_args = Decoder.filter_args(**kwargs['dec']) + Encoder = kwargs["enc_class"] + Decoder = kwargs["dec_class"] + enc_args = Encoder.filter_args(**kwargs["enc"]) + dec_args = Decoder.filter_args(**kwargs["dec"]) vae_args = VAE.filter_args(**kwargs) # add some extra arguments if Encoder in ( - DC1dEncoder, - ResNet1dEncoder, - TransformerEncoderV1, - ConformerEncoderV1): - enc_args['in_feats'] = kwargs['in_feats'] + DC1dEncoder, + ResNet1dEncoder, + TransformerEncoderV1, + ConformerEncoderV1, + ): + enc_args["in_feats"] = kwargs["in_feats"] - if Encoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - enc_args['out_time_dim'] = -1 + if Encoder in (TransformerEncoderV1, ConformerEncoderV1): + enc_args["out_time_dim"] = -1 - if Decoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - dec_args['out_time_dim'] = -1 + if Decoder in (TransformerEncoderV1, ConformerEncoderV1): + dec_args["out_time_dim"] = -1 if rank == 0: - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) if rank == 0: - logging.info('vae-model={}'.format(model)) + logging.info("vae-model={}".format(model)) return model @@ -120,21 +136,21 @@ def train_vae(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_model(**kwargs) trn_args = 
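In the DDP scripts, init_data splits the worker pool across processes with num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus), which is just ceiling division written with float arithmetic. The check below confirms the identity for small counts; the pure-integer form (n + g - 1) // g gives the same result without any float rounding concerns for very large values.

import math

for num_workers in range(9):
    for num_gpus in (1, 2, 3, 4):
        a = int((num_workers + num_gpus - 1) / num_gpus)
        assert a == math.ceil(num_workers / num_gpus)
        assert a == (num_workers + num_gpus - 1) // num_gpus
print("workers-per-GPU ceiling division verified")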
Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - trainer = Trainer(model, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -142,8 +158,7 @@ def train_vae(gpu_id, args): ddp.ddp_cleanup() - -# (data_rspec, train_list, val_list, +# (data_rspec, train_list, val_list, # num_gpus, resume, num_workers, **kwargs): # set_float_cpu('float32') @@ -168,9 +183,9 @@ def train_vae(gpu_id, args): # logging.info('trainer args={}'.format(trn_args)) # logging.info('init datasets') -# train_data = SD(data_rspec, train_list, +# train_data = SD(data_rspec, train_list, # return_class=False, **sd_args) -# val_data = SD(data_rspec, val_list, +# val_data = SD(data_rspec, val_list, # return_class=False, is_val=True, **sd_args) # logging.info('init samplers') @@ -201,40 +216,42 @@ def train_vae(gpu_id, args): # trainer.load_last_checkpoint() # trainer.fit(train_loader, test_loader) + def make_parser(enc_class, dec_class): Encoder = enc_dict[enc_class] Decoder = dec_dict[dec_class] parser = ArgumentParser() - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) if Encoder in ( - DC1dEncoder, - ResNet1dEncoder, - TransformerEncoderV1, - ConformerEncoderV1): - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + DC1dEncoder, + ResNet1dEncoder, + TransformerEncoderV1, + ConformerEncoderV1, + ): + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_class_args(parser, prefix='enc') + Encoder.add_class_args(parser, prefix="enc") dec_args = {} - if Decoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - dec_args['in_feats'] = True - Decoder.add_class_args(parser, prefix='dec', **dec_args) + if Decoder in (TransformerEncoderV1, ConformerEncoderV1): + dec_args["in_feats"] = True + Decoder.add_class_args(parser, prefix="dec", **dec_args) VAE.add_class_args(parser) Trainer.add_class_args(parser) @@ -242,28 +259,31 @@ def make_parser(enc_class, dec_class): # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--seed", 
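Note also the rank gate that recurs in every init_* helper: all ranks build the datasets and the model, but only rank 0 emits the log lines, so a multi-GPU run prints each message once. A standalone sketch of that pattern (rank is a plain argument here; in the scripts it comes from ddp.ddp_init):

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")

def log_config(cfg, rank):
    # every rank does the work; only rank 0 reports it
    if rank == 0:
        logging.info("config=%s", cfg)

log_config({"lr": 1e-3}, rank=0)  # printed once
log_config({"lr": 1e-3}, rank=1)  # silent on the other workers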
type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == '__main__': +if __name__ == "__main__": - parser = ArgumentParser( - description='Train Denoising VAE') + parser = ArgumentParser(description="Train Denoising VAE") - parser.add_argument('--local_rank', default=0, type=int) - parser.add_argument('--cfg', action=ActionConfigFile) + parser.add_argument("--local_rank", default=0, type=int) + parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() for ke, ve in enc_dict.items(): for kd, vd in dec_dict.items(): - k = '%s:%s' % (ke, kd) + k = "%s:%s" % (ke, kd) parser_k = make_parser(ke, kd) subcommands.add_subcommand(k, parser_k) @@ -276,20 +296,18 @@ def make_parser(enc_class, dec_class): if gpu_id == 0: try: - config_file = Path(args_sc.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args_sc.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass - ed = vae_type.split(':') + ed = vae_type.split(":") args_sc.enc_class = enc_dict[ed[0]] args_sc.dec_class = dec_dict[ed[1]] # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_vae(gpu_id, args_sc) - - # parser.add_argument('--local_rank', default=0, type=int) # args = parser.parse_args() diff --git a/hyperion/bin/torch-train-efficientnet-xvec-from-wav.py b/hyperion/bin/torch-train-efficientnet-xvec-from-wav.py index 40e83f6e..2e14a0d0 100755 --- a/hyperion/bin/torch-train-efficientnet-xvec-from-wav.py +++ b/hyperion/bin/torch-train-efficientnet-xvec-from-wav.py @@ -5,10 +5,15 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -26,83 +31,100 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF -def init_data(audio_path, train_list, val_list, - train_aug_cfg, val_aug_cfg, num_workers, - num_gpus, rank, **kwargs): +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs +): ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(ad_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if 
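torch-train-dvae.py exposes one subcommand per encoder/decoder pairing, named "<enc>:<dec>", and after parsing it maps the chosen name back to classes with vae_type.split(":"). The toy registry below reproduces only that naming scheme, with placeholder strings instead of the real encoder/decoder classes:

# placeholder registries; the real ones map names to hyperion narch classes
enc_dict = {"dc1d": "DC1dEncoder", "resnet1d": "ResNet1dEncoder"}
dec_dict = {"dc1d": "DC1dDecoder", "resnet1d": "ResNet1dDecoder"}

# one subcommand name per encoder/decoder combination
subcommands = ["%s:%s" % (ke, kd) for ke in enc_dict for kd in dec_dict]
print(subcommands)            # ['dc1d:dc1d', 'dc1d:resnet1d', 'resnet1d:dc1d', ...]

vae_type = "resnet1d:dc1d"    # as chosen on the command line
ed = vae_type.split(":")
enc_class, dec_class = enc_dict[ed[0]], dec_dict[ed[1]]
print(enc_class, dec_class)   # ResNet1dEncoder DC1dDecoder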
num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) + feat_args = AF.filter_args(**kwargs["feats"]) if rank == 0: - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=True, **feat_args) if rank == 0: - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) return feat_extractor def init_xvector(num_classes, rank, **kwargs): xvec_args = XVec.filter_args(**kwargs) if rank == 0: - logging.info('xvector network args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = XVec(**xvec_args) if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model def train_xvec(gpu_id, args): - + config_logger(args.verbose) del args.verbose logging.debug(args) kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) feat_extractor = init_feats(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, feat_extractor, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -110,38 +132,44 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser( - description='Train XVector with ResNet encoder from audio files') + description="Train XVector with ResNet encoder from audio files" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--audio-path', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) AD.add_class_args(parser) Sampler.add_class_args(parser) - parser.add_argument('--train-aug-cfg', default=None) - parser.add_argument('--val-aug-cfg', default=None) + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", 
type=int, default=5, help="num_workers of data loader" + ) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") XVec.add_class_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -149,13 +177,11 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) - - diff --git a/hyperion/bin/torch-train-efficientnet-xvec.py b/hyperion/bin/torch-train-efficientnet-xvec.py index 6b93f695..801b3afb 100755 --- a/hyperion/bin/torch-train-efficientnet-xvec.py +++ b/hyperion/bin/torch-train-efficientnet-xvec.py @@ -7,10 +7,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -26,30 +31,34 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy -def init_data(data_rspec, train_list, val_list, - num_workers, num_gpus, rank, **kwargs): + +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( 
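Each DDP entry point ends the same way: rank 0 tries to save the resolved configuration as exp_path/config.yaml (failures are silently ignored), and the start method is switched to "forkserver" before the training processes are created, since, as the in-script comment notes, the torch docs recommend it (forked children cannot safely reuse an already-initialized CUDA context). A reduced sketch of that startup tail; the path and file contents are placeholders:

import multiprocessing
from pathlib import Path

def save_config(exp_path, text):
    try:
        config_file = Path(exp_path) / "config.yaml"
        config_file.write_text(text)
    except OSError:
        pass  # the scripts swallow errors here (e.g. exp_path not created yet)

if __name__ == "__main__":
    save_config("exp/vae_example", "seed: 1123581321\n")  # placeholder experiment dir
    # "forkserver" is POSIX-only; children start from a clean server process, so a
    # CUDA context initialized in the parent is never inherited by the workers.
    multiprocessing.set_start_method("forkserver")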
- val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader @@ -57,11 +66,11 @@ def init_data(data_rspec, train_list, val_list, def init_xvector(num_classes, rank, **kwargs): xvec_args = XVec.filter_args(**kwargs) if rank == 0: - logging.info('xvector network args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = XVec(**xvec_args) if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model @@ -72,21 +81,21 @@ def train_xvec(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -94,34 +103,38 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() -if __name__ == '__main__': +if __name__ == "__main__": - parser = ArgumentParser( - description='Train XVector with ResNet encoder') + parser = ArgumentParser(description="Train XVector with ResNet encoder") - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_class_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) XVec.add_class_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -129,13 +142,13 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 
'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) @@ -165,7 +178,7 @@ def train_xvec(gpu_id, args): # from hyperion.torch.data import ClassWeightedSeqSampler as Sampler # from hyperion.torch.metrics import CategoricalAccuracy -# def train_xvec(data_rspec, train_list, val_list, +# def train_xvec(data_rspec, train_list, val_list, # num_gpus, resume, num_workers, **kwargs): # set_float_cpu('float32') @@ -208,8 +221,8 @@ def train_xvec(gpu_id, args): # optimizer = OF.create(model.parameters(), **opt_args) # lr_sch = LRSF.create(optimizer, **lrsch_args) # metrics = { 'acc': CategoricalAccuracy() } - -# trainer = Trainer(model, optimizer, + +# trainer = Trainer(model, optimizer, # device=device, metrics=metrics, lr_scheduler=lr_sch, # data_parallel=(num_gpus>1), **trn_args) # if resume: @@ -217,7 +230,6 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) - # if __name__ == '__main__': # parser = argparse.ArgumentParser( @@ -236,15 +248,15 @@ def train_xvec(gpu_id, args): # LRSF.add_argparse_args(parser, prefix='lrsch') # Trainer.add_argparse_args(parser) -# parser.add_argument('--num-workers', type=int, default=5, +# parser.add_argument('--num-workers', type=int, default=5, # help='num_workers of data loader') # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') -# parser.add_argument('--seed', type=int, default=1123581321, +# parser.add_argument('--seed', type=int, default=1123581321, # help='random seed (default: 1)') # parser.add_argument('--resume', action='store_true', default=False, # help='resume training from checkpoint') -# parser.add_argument('-v', '--verbose', dest='verbose', default=1, +# parser.add_argument('-v', '--verbose', dest='verbose', default=1, # choices=[0, 1, 2, 3], type=int) # args = parser.parse_args() @@ -256,4 +268,3 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) - diff --git a/hyperion/bin/torch-train-resnet-xvec-from-wav.py b/hyperion/bin/torch-train-resnet-xvec-from-wav.py index 4788b92a..1b0b6f57 100755 --- a/hyperion/bin/torch-train-resnet-xvec-from-wav.py +++ b/hyperion/bin/torch-train-resnet-xvec-from-wav.py @@ -6,20 +6,27 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np import torch import torch.nn as nn + # import torch.multiprocessing as mp from hyperion.hyp_defs import config_logger, set_float_cpu from hyperion.torch.utils import open_device from hyperion.torch.utils import ddp + # from hyperion.torch.helpers import OptimizerFactory as OF # from hyperion.torch.lr_schedulers import LRSchedulerFactory as LRSF from hyperion.torch.trainers import XVectorTrainerFromWav as Trainer @@ -28,8 +35,8 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy -#from hyperion.torch.layers import AudioFeatsFactory as AFF -#from hyperion.torch.layers import MeanVarianceNorm as MVN +# from 
hyperion.torch.layers import AudioFeatsFactory as AFF +# from hyperion.torch.layers import MeanVarianceNorm as MVN from hyperion.torch.narchs import AudioFeatsMVN as AF # from torch.utils.data import dataloader @@ -64,7 +71,6 @@ # return f - # def init_device(num_gpus): # set_float_cpu('float32') # logging.info('initializing devices num_gpus={}'.format(num_gpus)) @@ -72,33 +78,45 @@ # return device -def init_data(audio_path, train_list, val_list, - train_aug_cfg, val_aug_cfg, num_workers, - num_gpus, rank, **kwargs): +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs +): ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(ad_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader @@ -122,24 +140,24 @@ def init_data(audio_path, train_list, val_list, def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) + feat_args = AF.filter_args(**kwargs["feats"]) if rank == 0: - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=True, **feat_args) if rank == 0: - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) return feat_extractor def init_xvector(num_classes, rank, **kwargs): xvec_args = XVec.filter_args(**kwargs) if rank == 0: - logging.info('xvector network args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = XVec(**xvec_args) if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model @@ -158,34 +176,34 @@ def init_xvector(num_classes, rank, **kwargs): def train_xvec(gpu_id, args): - + config_logger(args.verbose) del args.verbose logging.debug(args) kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - #use_gpu = ddp_args['num_gpus'] > 0 - #kwargs['use_gpu'] = use_gpu - kwargs['rank'] = rank + # use_gpu = ddp_args['num_gpus'] > 0 + # 
kwargs['use_gpu'] = use_gpu + kwargs["rank"] = rank # train_loader, test_loader = init_data( - # args.audio_path, args.train_list, args.val_list, - # args.train_aug_cfg, args.val_aug_cfg, args.num_workers, + # args.audio_path, args.train_list, args.val_list, + # args.train_aug_cfg, args.val_aug_cfg, args.num_workers, # use_gpu, **kwargs) train_loader, test_loader = init_data(**kwargs) feat_extractor = init_feats(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) - #model.to(device) - #optimizer, lr_sch = init_opt(model, **kwargs) + # model.to(device) + # optimizer, lr_sch = init_opt(model, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) + logging.info("trainer args={}".format(trn_args)) # total_params = 0 # total_endpoints = 0 @@ -199,13 +217,18 @@ def train_xvec(gpu_id, args): # logging.info(f"Total Trainable Params: {total_params}") # logging.info(f"Total Trainable Endpoint Params: {total_endpoints}") - metrics = { 'acc': CategoricalAccuracy() } - # trainer = Trainer(model, feat_extractor, optimizer, + metrics = {"acc": CategoricalAccuracy()} + # trainer = Trainer(model, feat_extractor, optimizer, # device=device, metrics=metrics, lr_scheduler=lr_sch, # ddp=world_size>1, **trn_args) - trainer = Trainer(model, feat_extractor, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -213,26 +236,28 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser( - description='Train XVector with ResNet encoder from audio files') + description="Train XVector with ResNet encoder from audio files" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--audio-path', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) AD.add_class_args(parser) Sampler.add_class_args(parser) - parser.add_argument('--train-aug-cfg', default=None) - parser.add_argument('--val-aug-cfg', default=None) + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") # feats_parser = ArgumentParser() # AFF.add_class_args(feats_parser, prefix='audio_feats') # MVN.add_class_args(feats_parser, prefix='mvn') @@ -244,30 +269,33 @@ def train_xvec(gpu_id, args): Trainer.add_class_args(parser) ddp.add_ddp_args(parser) - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, 
default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() - #device = init_device(args.num_gpus) - #mp.spawn(train_xvec, nprocs=args.num_gpus, args=(args,)) + # device = init_device(args.num_gpus) + # mp.spawn(train_xvec, nprocs=args.num_gpus, args=(args,)) gpu_id = args.local_rank del args.local_rank if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) - diff --git a/hyperion/bin/torch-train-resnet1d-dvae.py b/hyperion/bin/torch-train-resnet1d-dvae.py index 009774ae..420cf7b2 100755 --- a/hyperion/bin/torch-train-resnet1d-dvae.py +++ b/hyperion/bin/torch-train-resnet1d-dvae.py @@ -26,48 +26,61 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - in_feats, num_gpus, resume, num_workers, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def train_vae( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + in_feats, + num_gpus, + resume, + num_workers, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, train_pair_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, val_pair_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + 
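The --local_rank flag that these mains accept exists because torch.distributed.launch starts one process per GPU and passes each process its local rank on the command line; the scripts then forward it to the ddp helpers. A standalone sketch of that launcher convention (in hyperion the actual device selection happens inside hyperion.torch.utils.ddp):

import argparse
import torch

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args([])  # the launcher would pass e.g. ["--local_rank", "2"]

if torch.cuda.is_available():
    # bind this process to its assigned GPU
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
else:
    device = torch.device("cpu")
print(device)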
logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD( + data_rspec, train_list, train_pair_list, return_class=False, **sd_args + ) + val_data = SD( + data_rspec, val_list, val_pair_list, return_class=False, is_val=True, **sd_args + ) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) @@ -76,55 +89,67 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Denoising VAE with ResNet1d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train Denoising VAE with ResNet1d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - 
help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -135,4 +160,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-resnet1d-vae.py b/hyperion/bin/torch-train-resnet1d-vae.py index 6c272166..a8edb3c3 100755 --- a/hyperion/bin/torch-train-resnet1d-vae.py +++ b/hyperion/bin/torch-train-resnet1d-vae.py @@ -26,45 +26,48 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, in_feats, - num_gpus, resume, num_workers, **kwargs): +def train_vae( + data_rspec, train_list, val_list, in_feats, num_gpus, resume, num_workers, **kwargs +): - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = 
torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) @@ -73,53 +76,65 @@ def train_vae(data_rspec, train_list, val_list, in_feats, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train VAE with ResNet1d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train VAE with ResNet1d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', dest='data_rspec', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--val-list', dest='val_list', required=True) + parser.add_argument("--data-rspec", dest="data_rspec", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -130,4 +145,3 
@@ def train_vae(data_rspec, train_list, val_list, in_feats, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-resnet1d-vq-dvae.py b/hyperion/bin/torch-train-resnet1d-vq-dvae.py index aa517330..9571eff8 100755 --- a/hyperion/bin/torch-train-resnet1d-vq-dvae.py +++ b/hyperion/bin/torch-train-resnet1d-vq-dvae.py @@ -26,48 +26,61 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - in_feats, num_gpus, resume, num_workers, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def train_vae( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + in_feats, + num_gpus, + resume, + num_workers, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, train_pair_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, val_pair_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD( + data_rspec, train_list, train_pair_list, return_class=False, **sd_args + ) + val_data = SD( + data_rspec, val_list, val_pair_list, return_class=False, is_val=True, **sd_args + ) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(in_feats, **enc_args) decoder = 
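The older single-GPU scripts (the dc1d/dc2d and resnet1d variants above) all finish by popping the flags that train_vae does not accept and forwarding the rest as train_vae(**vars(args)). A compact argparse sketch of that dispatch; the option values are placeholders:

import argparse

def train_vae(data_rspec, train_list, num_gpus, resume, **kwargs):
    print(data_rspec, train_list, num_gpus, resume, kwargs)

parser = argparse.ArgumentParser()
parser.add_argument("--data-rspec", required=True)
parser.add_argument("--train-list", required=True)
parser.add_argument("--num-gpus", type=int, default=1)
parser.add_argument("--resume", action="store_true", default=False)
parser.add_argument("--seed", type=int, default=1123581321)
parser.add_argument("-v", "--verbose", type=int, default=1)

args = parser.parse_args(["--data-rspec", "ark:feats.ark", "--train-list", "lists/train.scp"])
del args.verbose  # consumed by config_logger in the real scripts
del args.seed     # consumed by torch.manual_seed
train_vae(**vars(args))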
Decoder(**dec_args) @@ -76,55 +89,69 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Denoising VQ-VAE with ResNet1d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train Denoising VQ-VAE with ResNet1d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed (default: 1)') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed (default: 1)" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -135,4 +162,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-resnet1d-vq-vae.py 
b/hyperion/bin/torch-train-resnet1d-vq-vae.py index 0249d5b8..373be8f3 100755 --- a/hyperion/bin/torch-train-resnet1d-vq-vae.py +++ b/hyperion/bin/torch-train-resnet1d-vq-vae.py @@ -26,46 +26,47 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - in_feats, num_gpus, resume, num_workers, **kwargs): +def train_vae( + data_rspec, train_list, val_list, in_feats, num_gpus, resume, num_workers, **kwargs +): - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD(data_rspec, train_list, return_class=False, **sd_args) + val_data = SD(data_rspec, val_list, return_class=False, is_val=True, **sd_args) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) @@ -74,52 +75,65 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + 
trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train VQ-VAE with ResNet1d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train VQ-VAE with ResNet1d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -130,4 +144,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-resnet2d-dvae.py b/hyperion/bin/torch-train-resnet2d-dvae.py index 3f7e26ed..6845750f 100755 --- a/hyperion/bin/torch-train-resnet2d-dvae.py +++ b/hyperion/bin/torch-train-resnet2d-dvae.py @@ -26,48 +26,60 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - num_gpus, resume, num_workers, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def train_vae( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + num_gpus, + resume, + num_workers, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = 
open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, train_pair_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, val_pair_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD( + data_rspec, train_list, train_pair_list, return_class=False, **sd_args + ) + val_data = SD( + data_rspec, val_list, val_pair_list, return_class=False, is_val=True, **sd_args + ) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) @@ -76,52 +88,66 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Denoising VAE with ResNet2d Encoder-Decoder') + fromfile_prefix_chars="@", + 
description="Train Denoising VAE with ResNet2d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed (default: 1)') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed (default: 1)" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -132,4 +158,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-resnet2d-vae.py b/hyperion/bin/torch-train-resnet2d-vae.py index 8b11d57f..575c5575 100755 --- a/hyperion/bin/torch-train-resnet2d-vae.py +++ b/hyperion/bin/torch-train-resnet2d-vae.py @@ -6,10 +6,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -25,48 +30,52 @@ from hyperion.torch.data import FeatSeqDataset as SD from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def init_data(data_rspec, train_list, val_list, - num_workers, num_gpus, rank, **kwargs): + +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler 
args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_model(rank, **kwargs): - enc_args = Encoder.filter_args(**kwargs['enc']) - dec_args = Decoder.filter_args(**kwargs['dec']) + enc_args = Encoder.filter_args(**kwargs["enc"]) + dec_args = Decoder.filter_args(**kwargs["dec"]) vae_args = VAE.filter_args(**kwargs) if rank == 0: - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) if rank == 0: - logging.info('vae-model={}'.format(model)) + logging.info("vae-model={}".format(model)) return model @@ -77,21 +86,21 @@ def train_vae(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_model(**kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - trainer = Trainer(model, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -99,8 +108,7 @@ def train_vae(gpu_id, args): ddp.ddp_cleanup() - -# (data_rspec, train_list, val_list, +# (data_rspec, train_list, val_list, # num_gpus, resume, num_workers, **kwargs): # set_float_cpu('float32') @@ -125,9 +133,9 @@ def train_vae(gpu_id, args): # logging.info('trainer args={}'.format(trn_args)) # logging.info('init datasets') -# train_data = SD(data_rspec, train_list, +# train_data = SD(data_rspec, train_list, # return_class=False, **sd_args) -# val_data = SD(data_rspec, val_list, +# val_data = SD(data_rspec, val_list, # return_class=False, is_val=True, **sd_args) # logging.info('init samplers') @@ -159,25 +167,24 @@ def train_vae(gpu_id, args): # trainer.fit(train_loader, test_loader) +if __name__ == "__main__": -if __name__ == '__main__': + parser = ArgumentParser(description="Train VAE with ResNet2d Encoder-Decoder") - parser = ArgumentParser( - 
description='Train VAE with ResNet2d Encoder-Decoder') - - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - Encoder.add_class_args(parser, prefix='enc') - Decoder.add_class_args(parser, prefix='dec') + Encoder.add_class_args(parser, prefix="enc") + Decoder.add_class_args(parser, prefix="dec") VAE.add_class_args(parser) Trainer.add_class_args(parser) @@ -185,13 +192,17 @@ def train_vae(gpu_id, args): # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -199,13 +210,13 @@ def train_vae(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_vae(gpu_id, args) # config_logger(args.verbose) @@ -216,4 +227,3 @@ def train_vae(gpu_id, args): # del args.seed # train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-resnet2d-vq-dvae.py b/hyperion/bin/torch-train-resnet2d-vq-dvae.py index 2b1aa757..95eb3923 100755 --- a/hyperion/bin/torch-train-resnet2d-vq-dvae.py +++ b/hyperion/bin/torch-train-resnet2d-vq-dvae.py @@ -26,48 +26,60 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - num_gpus, resume, num_workers, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def train_vae( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + num_gpus, + resume, + num_workers, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', 
**kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, train_pair_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, val_pair_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD( + data_rspec, train_list, train_pair_list, return_class=False, **sd_args + ) + val_data = SD( + data_rspec, val_list, val_pair_list, return_class=False, is_val=True, **sd_args + ) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) @@ -76,53 +88,64 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Denoising VQ-VAE with ResNet2d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train Denoising VQ-VAE with ResNet2d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', 
required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -133,4 +156,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-resnet2d-vq-vae.py b/hyperion/bin/torch-train-resnet2d-vq-vae.py index 24bc9d77..07f25d5f 100755 --- a/hyperion/bin/torch-train-resnet2d-vq-vae.py +++ b/hyperion/bin/torch-train-resnet2d-vq-vae.py @@ -26,47 +26,48 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - num_gpus, resume, num_workers, **kwargs): +def train_vae( + data_rspec, train_list, val_list, num_gpus, resume, num_workers, **kwargs +): - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - 
logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD(data_rspec, train_list, return_class=False, **sd_args) + val_data = SD(data_rspec, val_list, return_class=False, is_val=True, **sd_args) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus > 0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) @@ -75,49 +76,61 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train VQ-VAE with ResNet2d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train VQ-VAE with ResNet2d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec') + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec") 
VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') - - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") + + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -128,4 +141,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-spinenet-xvec-from-wav.py b/hyperion/bin/torch-train-spinenet-xvec-from-wav.py index b90334a0..3583f70c 100755 --- a/hyperion/bin/torch-train-spinenet-xvec-from-wav.py +++ b/hyperion/bin/torch-train-spinenet-xvec-from-wav.py @@ -6,10 +6,15 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -27,83 +32,100 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF -def init_data(audio_path, train_list, val_list, - train_aug_cfg, val_aug_cfg, num_workers, - num_gpus, rank, **kwargs): +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs +): ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(ad_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) + feat_args = 
AF.filter_args(**kwargs["feats"]) if rank == 0: - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=True, **feat_args) if rank == 0: - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) return feat_extractor def init_xvector(num_classes, rank, **kwargs): xvec_args = XVec.filter_args(**kwargs) if rank == 0: - logging.info('xvector network args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = XVec(**xvec_args) if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model def train_xvec(gpu_id, args): - + config_logger(args.verbose) del args.verbose logging.debug(args) kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) feat_extractor = init_feats(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, feat_extractor, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -111,38 +133,44 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser( - description='Train XVector with ResNet encoder from audio files') + description="Train XVector with ResNet encoder from audio files" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--audio-path', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) AD.add_class_args(parser) Sampler.add_class_args(parser) - parser.add_argument('--train-aug-cfg', default=None) - parser.add_argument('--val-aug-cfg', default=None) + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") XVec.add_class_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - 
parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -150,17 +178,16 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) - # import sys # import os # import argparse @@ -202,7 +229,7 @@ def train_xvec(gpu_id, args): # return f -# def train_xvec(audio_path, train_list, val_list, +# def train_xvec(audio_path, train_list, val_list, # train_aug_cfg, val_aug_cfg, # num_gpus, resume, num_workers, **kwargs): @@ -273,8 +300,8 @@ def train_xvec(gpu_id, args): # optimizer = OF.create(model.parameters(), **opt_args) # lr_sch = LRSF.create(optimizer, **lrsch_args) # metrics = { 'acc': CategoricalAccuracy() } - -# trainer = Trainer(model, feat_extractor, optimizer, + +# trainer = Trainer(model, feat_extractor, optimizer, # device=device, metrics=metrics, lr_scheduler=lr_sch, # data_parallel=(num_gpus>1), **trn_args) # if resume: @@ -282,7 +309,6 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) - # if __name__ == '__main__': # parser = argparse.ArgumentParser( @@ -300,7 +326,7 @@ def train_xvec(gpu_id, args): # parser.add_argument('--train-aug-cfg', default=None) # parser.add_argument('--val-aug-cfg', default=None) -# parser.add_argument('--num-workers', type=int, default=5, +# parser.add_argument('--num-workers', type=int, default=5, # help='num_workers of data loader') # AFF.add_argparse_args(parser, prefix='feats') @@ -313,11 +339,11 @@ def train_xvec(gpu_id, args): # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') -# parser.add_argument('--seed', type=int, default=1123581321, +# parser.add_argument('--seed', type=int, default=1123581321, # help='random seed') # parser.add_argument('--resume', action='store_true', default=False, # help='resume training from checkpoint') -# parser.add_argument('-v', '--verbose', dest='verbose', default=1, +# parser.add_argument('-v', '--verbose', dest='verbose', default=1, # choices=[0, 1, 2, 3], type=int) # args = parser.parse_args() @@ -329,5 +355,3 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) - - diff --git a/hyperion/bin/torch-train-tdnn-xvec-from-wav.py b/hyperion/bin/torch-train-tdnn-xvec-from-wav.py index 74178f13..beb2e6f7 100755 --- a/hyperion/bin/torch-train-tdnn-xvec-from-wav.py +++ b/hyperion/bin/torch-train-tdnn-xvec-from-wav.py @@ -5,10 +5,15 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing 
+import multiprocessing import numpy as np @@ -26,83 +31,100 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF -def init_data(audio_path, train_list, val_list, - train_aug_cfg, val_aug_cfg, num_workers, - num_gpus, rank, **kwargs): +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs +): ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(ad_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) + feat_args = AF.filter_args(**kwargs["feats"]) if rank == 0: - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=True, **feat_args) if rank == 0: - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) return feat_extractor def init_xvector(num_classes, rank, **kwargs): xvec_args = XVec.filter_args(**kwargs) if rank == 0: - logging.info('xvector network args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = XVec(**xvec_args) if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model def train_xvec(gpu_id, args): - + config_logger(args.verbose) del args.verbose logging.debug(args) kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) feat_extractor = init_feats(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, feat_extractor, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = 
Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -110,38 +132,44 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser( - description='Train XVector with ResNet encoder from audio files') + description="Train XVector with ResNet encoder from audio files" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--audio-path', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) AD.add_class_args(parser) Sampler.add_class_args(parser) - parser.add_argument('--train-aug-cfg', default=None) - parser.add_argument('--val-aug-cfg', default=None) + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") XVec.add_class_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -149,12 +177,11 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) - diff --git a/hyperion/bin/torch-train-tdnn-xvec.py b/hyperion/bin/torch-train-tdnn-xvec.py index f77189b2..3fb691b1 100755 --- a/hyperion/bin/torch-train-tdnn-xvec.py +++ b/hyperion/bin/torch-train-tdnn-xvec.py @@ -7,10 +7,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -26,30 +31,34 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy -def 
init_data(data_rspec, train_list, val_list, - num_workers, num_gpus, rank, **kwargs): + +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader @@ -57,11 +66,11 @@ def init_data(data_rspec, train_list, val_list, def init_xvector(num_classes, rank, **kwargs): xvec_args = XVec.filter_args(**kwargs) if rank == 0: - logging.info('xvector network args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = XVec(**xvec_args) if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model @@ -72,21 +81,21 @@ def train_xvec(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -94,34 +103,38 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() -if __name__ == '__main__': +if __name__ == "__main__": - parser = ArgumentParser( - description='Train XVector with ResNet encoder') + parser = ArgumentParser(description="Train XVector with ResNet encoder") - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", 
required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_class_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) XVec.add_class_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -129,17 +142,16 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) - # #!/usr/bin/env python # """ # Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) @@ -166,7 +178,7 @@ def train_xvec(gpu_id, args): # from hyperion.torch.data import ClassWeightedSeqSampler as Sampler # from hyperion.torch.metrics import CategoricalAccuracy -# def train_xvec(data_rspec, train_list, val_list, +# def train_xvec(data_rspec, train_list, val_list, # num_gpus, resume, num_workers, **kwargs): # set_float_cpu('float32') @@ -209,8 +221,8 @@ def train_xvec(gpu_id, args): # optimizer = OF.create(model.parameters(), **opt_args) # lr_sch = LRSF.create(optimizer, **lrsch_args) # metrics = { 'acc': CategoricalAccuracy() } - -# trainer = Trainer(model, optimizer, + +# trainer = Trainer(model, optimizer, # device=device, metrics=metrics, lr_scheduler=lr_sch, # data_parallel=(num_gpus>1), **trn_args) # if resume: @@ -218,7 +230,6 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) - # if __name__ == '__main__': # parser = argparse.ArgumentParser( @@ -237,11 +248,11 @@ def train_xvec(gpu_id, args): # LRSF.add_argparse_args(parser, prefix='lrsch') # Trainer.add_argparse_args(parser) -# parser.add_argument('--num-workers', type=int, default=5, +# parser.add_argument('--num-workers', type=int, default=5, # help='num_workers of data loader') # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') -# parser.add_argument('--seed', type=int, default=1, +# parser.add_argument('--seed', type=int, default=1, # help='random seed (default: 1)') # parser.add_argument('--resume', action='store_true', default=False, # help='resume training from checkpoint') @@ -256,4 +267,3 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) - diff --git a/hyperion/bin/torch-train-transformer-enc-v1-dvae.py b/hyperion/bin/torch-train-transformer-enc-v1-dvae.py index 
bffcb379..39ee2974 100755 --- a/hyperion/bin/torch-train-transformer-enc-v1-dvae.py +++ b/hyperion/bin/torch-train-transformer-enc-v1-dvae.py @@ -27,51 +27,64 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - in_feats, num_gpus, resume, num_workers, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) +def train_vae( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + in_feats, + num_gpus, + resume, + num_workers, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, train_pair_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, val_pair_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD( + data_rspec, train_list, train_pair_list, return_class=False, **sd_args + ) + val_data = SD( + data_rspec, val_list, val_pair_list, return_class=False, is_val=True, **sd_args + ) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) - enc_args['out_time_dim'] = -1 - dec_args['out_time_dim'] = -1 + enc_args["out_time_dim"] = -1 + dec_args["out_time_dim"] = -1 encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) @@ -79,54 +92,67 @@ def 
train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Denoising VAE with ResNet1d Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train Denoising VAE with ResNet1d Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec', in_feats=True) + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec", in_feats=True) VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', - default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -137,4 +163,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-transformer-enc-v1-vae.py b/hyperion/bin/torch-train-transformer-enc-v1-vae.py index 
bd5f7d25..9f5cbdf8 100755 --- a/hyperion/bin/torch-train-transformer-enc-v1-vae.py +++ b/hyperion/bin/torch-train-transformer-enc-v1-vae.py @@ -26,48 +26,51 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, in_feats, - num_gpus, resume, num_workers, **kwargs): +def train_vae( + data_rspec, train_list, val_list, in_feats, num_gpus, resume, num_workers, **kwargs +): - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) - enc_args['out_time_dim'] = -1 - dec_args['out_time_dim'] = -1 + enc_args["out_time_dim"] = -1 + dec_args["out_time_dim"] = -1 encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) @@ -75,53 +78,65 @@ def train_vae(data_rspec, train_list, val_list, in_feats, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": 
nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train VAE with Transformer Encoder as Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train VAE with Transformer Encoder as Encoder-Decoder", + ) - parser.add_argument('--data-rspec', dest='data_rspec', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--val-list', dest='val_list', required=True) + parser.add_argument("--data-rspec", dest="data_rspec", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec', in_feats=True) + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec", in_feats=True) VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -132,4 +147,3 @@ def train_vae(data_rspec, train_list, val_list, in_feats, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-transformer-enc-v1-vq-dvae.py b/hyperion/bin/torch-train-transformer-enc-v1-vq-dvae.py index 2547481d..c6246fe3 100755 --- a/hyperion/bin/torch-train-transformer-enc-v1-vq-dvae.py +++ b/hyperion/bin/torch-train-transformer-enc-v1-vq-dvae.py @@ -26,51 +26,64 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - in_feats, num_gpus, resume, num_workers, **kwargs): - - set_float_cpu('float32') - logging.info('initializing devices 
num_gpus={}'.format(num_gpus)) +def train_vae( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + in_feats, + num_gpus, + resume, + num_workers, + **kwargs +): + + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, train_pair_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, val_pair_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD( + data_rspec, train_list, train_pair_list, return_class=False, **sd_args + ) + val_data = SD( + data_rspec, val_list, val_pair_list, return_class=False, is_val=True, **sd_args + ) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) - enc_args['out_time_dim'] = -1 - dec_args['out_time_dim'] = -1 + enc_args["out_time_dim"] = -1 + dec_args["out_time_dim"] = -1 encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) @@ -78,56 +91,70 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, 
+ device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) - - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Denoising VQ-VAE with Transformer Enconder as Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train Denoising VQ-VAE with Transformer Enconder as Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec', in_feats=True) + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec", in_feats=True) VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed (default: 1)') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument( + "--seed", type=int, default=1123581321, help="random seed (default: 1)" + ) + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -138,4 +165,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-transformer-enc-v1-vq-vae.py b/hyperion/bin/torch-train-transformer-enc-v1-vq-vae.py index 4e055532..4659e0d8 100755 --- a/hyperion/bin/torch-train-transformer-enc-v1-vq-vae.py +++ b/hyperion/bin/torch-train-transformer-enc-v1-vq-vae.py @@ -26,49 +26,50 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler -def train_vae(data_rspec, train_list, val_list, - in_feats, num_gpus, resume, num_workers, **kwargs): +def train_vae( + data_rspec, 
train_list, val_list, in_feats, num_gpus, resume, num_workers, **kwargs +): - set_float_cpu('float32') - logging.info('initializing devices num_gpus={}'.format(num_gpus)) + set_float_cpu("float32") + logging.info("initializing devices num_gpus={}".format(num_gpus)) device = open_device(num_gpus=num_gpus) sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) - enc_args = Encoder.filter_args(prefix='enc', **kwargs) - dec_args = Decoder.filter_args(prefix='dec', **kwargs) + enc_args = Encoder.filter_args(prefix="enc", **kwargs) + dec_args = Decoder.filter_args(prefix="dec", **kwargs) vae_args = VAE.filter_args(**kwargs) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) trn_args = Trainer.filter_args(**kwargs) - logging.info('seq dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('optimizer args={}'.format(opt_args)) - logging.info('lr scheduler args={}'.format(lrsch_args)) - logging.info('trainer args={}'.format(trn_args)) - - logging.info('init datasets') - train_data = SD(data_rspec, train_list, - return_class=False, **sd_args) - val_data = SD(data_rspec, val_list, - return_class=False, is_val=True, **sd_args) - - logging.info('init samplers') + logging.info("seq dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("optimizer args={}".format(opt_args)) + logging.info("lr scheduler args={}".format(lrsch_args)) + logging.info("trainer args={}".format(trn_args)) + + logging.info("init datasets") + train_data = SD(data_rspec, train_list, return_class=False, **sd_args) + val_data = SD(data_rspec, val_list, return_class=False, is_val=True, **sd_args) + + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) - largs = {'num_workers': num_workers, 'pin_memory': True} if num_gpus>0 else {} + largs = {"num_workers": num_workers, "pin_memory": True} if num_gpus > 0 else {} train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) - enc_args['out_time_dim'] = -1 - dec_args['out_time_dim'] = -1 + enc_args["out_time_dim"] = -1 + dec_args["out_time_dim"] = -1 encoder = Encoder(in_feats, **enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) @@ -76,52 +77,65 @@ def train_vae(data_rspec, train_list, val_list, optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - - trainer = Trainer(model, optimizer, - device=device, metrics=metrics, lr_scheduler=lr_sch, - data_parallel=(num_gpus>1), **trn_args) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + + trainer = Trainer( + model, + optimizer, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + data_parallel=(num_gpus > 1), + **trn_args + ) if resume: trainer.load_last_checkpoint() trainer.fit(train_loader, 
test_loader) - -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train VQ-VAE with Transformer Encoder as Encoder-Decoder') + fromfile_prefix_chars="@", + description="Train VQ-VAE with Transformer Encoder as Encoder-Decoder", + ) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_argparse_args(parser, prefix='enc') - Decoder.add_argparse_args(parser, prefix='dec', in_feats=True) + Encoder.add_argparse_args(parser, prefix="enc") + Decoder.add_argparse_args(parser, prefix="dec", in_feats=True) VAE.add_argparse_args(parser) - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") Trainer.add_argparse_args(parser) - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) @@ -132,4 +146,3 @@ def train_vae(data_rspec, train_list, val_list, del args.seed train_vae(**vars(args)) - diff --git a/hyperion/bin/torch-train-transformer-xvec-v1-from-wav.py b/hyperion/bin/torch-train-transformer-xvec-v1-from-wav.py index 605bd916..4aac039c 100755 --- a/hyperion/bin/torch-train-transformer-xvec-v1-from-wav.py +++ b/hyperion/bin/torch-train-transformer-xvec-v1-from-wav.py @@ -5,10 +5,15 @@ """ import sys import os -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -26,83 +31,100 @@ from hyperion.torch.narchs import AudioFeatsMVN as AF -def init_data(audio_path, train_list, val_list, - train_aug_cfg, val_aug_cfg, num_workers, - num_gpus, rank, **kwargs): +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs 
+): ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(ad_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) + feat_args = AF.filter_args(**kwargs["feats"]) if rank == 0: - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=True, **feat_args) if rank == 0: - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) return feat_extractor def init_xvector(num_classes, rank, **kwargs): xvec_args = XVec.filter_args(**kwargs) if rank == 0: - logging.info('xvector network args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = XVec(**xvec_args) if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model def train_xvec(gpu_id, args): - + config_logger(args.verbose) del args.verbose logging.debug(args) kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) feat_extractor = init_feats(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, feat_extractor, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -110,38 +132,44 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() -if __name__ == '__main__': +if __name__ == "__main__": parser = ArgumentParser( - description='Train XVector 
with ResNet encoder from audio files') + description="Train XVector with ResNet encoder from audio files" + ) - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--audio-path', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) AD.add_class_args(parser) Sampler.add_class_args(parser) - parser.add_argument('--train-aug-cfg', default=None) - parser.add_argument('--val-aug-cfg', default=None) + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") XVec.add_class_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -149,12 +177,11 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) - diff --git a/hyperion/bin/torch-train-transformer-xvec-v1.py b/hyperion/bin/torch-train-transformer-xvec-v1.py index 8a5e9b46..1dfa9ed5 100755 --- a/hyperion/bin/torch-train-transformer-xvec-v1.py +++ b/hyperion/bin/torch-train-transformer-xvec-v1.py @@ -7,10 +7,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -26,30 +31,34 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler from hyperion.torch.metrics import CategoricalAccuracy -def init_data(data_rspec, train_list, val_list, - num_workers, num_gpus, rank, **kwargs): + +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - 
logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader @@ -57,11 +66,11 @@ def init_data(data_rspec, train_list, val_list, def init_xvector(num_classes, rank, **kwargs): xvec_args = XVec.filter_args(**kwargs) if rank == 0: - logging.info('xvector network args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = XVec(**xvec_args) if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model @@ -72,21 +81,21 @@ def train_xvec(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_xvector(train_loader.dataset.num_classes, **kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -94,34 +103,38 @@ def train_xvec(gpu_id, args): ddp.ddp_cleanup() -if __name__ == '__main__': +if __name__ == "__main__": - parser = ArgumentParser( - description='Train XVector with ResNet encoder') + parser = ArgumentParser(description="Train XVector with ResNet encoder") - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_class_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) 
XVec.add_class_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - parser.add_argument('--local_rank', default=0, type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + parser.add_argument("--local_rank", default=0, type=int) args = parser.parse_args() gpu_id = args.local_rank @@ -129,13 +142,13 @@ def train_xvec(gpu_id, args): if gpu_id == 0: try: - config_file = Path(args.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args) @@ -164,7 +177,7 @@ def train_xvec(gpu_id, args): # from hyperion.torch.data import ClassWeightedSeqSampler as Sampler # from hyperion.torch.metrics import CategoricalAccuracy -# def train_xvec(data_rspec, train_list, val_list, +# def train_xvec(data_rspec, train_list, val_list, # num_gpus, resume, num_workers, **kwargs): # set_float_cpu('float32') @@ -207,8 +220,8 @@ def train_xvec(gpu_id, args): # optimizer = OF.create(model.parameters(), **opt_args) # lr_sch = LRSF.create(optimizer, **lrsch_args) # metrics = { 'acc': CategoricalAccuracy() } - -# trainer = Trainer(model, optimizer, + +# trainer = Trainer(model, optimizer, # device=device, metrics=metrics, lr_scheduler=lr_sch, # data_parallel=(num_gpus>1), **trn_args) @@ -217,7 +230,6 @@ def train_xvec(gpu_id, args): # trainer.fit(train_loader, test_loader) - # if __name__ == '__main__': # parser = argparse.ArgumentParser( @@ -235,15 +247,15 @@ def train_xvec(gpu_id, args): # OF.add_argparse_args(parser, prefix='opt') # LRSF.add_argparse_args(parser, prefix='lrsch') -# parser.add_argument('--num-workers', type=int, default=5, +# parser.add_argument('--num-workers', type=int, default=5, # help='num_workers of data loader') # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') -# parser.add_argument('--seed', type=int, default=1123581321, +# parser.add_argument('--seed', type=int, default=1123581321, # help='random seed (default: 1)') # parser.add_argument('--resume', action='store_true', default=False, # help='resume training from checkpoint') -# parser.add_argument('-v', '--verbose', dest='verbose', default=1, +# parser.add_argument('-v', '--verbose', dest='verbose', default=1, # choices=[0, 1, 2, 3], type=int) # args = parser.parse_args() @@ -255,4 +267,3 @@ def train_xvec(gpu_id, args): # del args.seed # train_xvec(**vars(args)) - diff --git a/hyperion/bin/torch-train-vae.py b/hyperion/bin/torch-train-vae.py index e8a16acc..29a6f7cd 100755 --- a/hyperion/bin/torch-train-vae.py +++ b/hyperion/bin/torch-train-vae.py @@ -6,10 +6,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, 
namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -30,85 +35,87 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler enc_dict = { - 'dc1d': DC1dEncoder, - 'dc2d': DC2dEncoder, - 'resnet1d': ResNet1dEncoder, - 'resnet2d': ResNet2dEncoder, - 'transformer-enc-v1': TransformerEncoderV1, - 'conformer-enc-v1': ConformerEncoderV1 } + "dc1d": DC1dEncoder, + "dc2d": DC2dEncoder, + "resnet1d": ResNet1dEncoder, + "resnet2d": ResNet2dEncoder, + "transformer-enc-v1": TransformerEncoderV1, + "conformer-enc-v1": ConformerEncoderV1, +} dec_dict = { - 'dc1d': DC1dDecoder, - 'dc2d': DC2dDecoder, - 'resnet1d': ResNet1dDecoder, - 'resnet2d': ResNet2dDecoder, - 'transformer-enc-v1': TransformerEncoderV1, - 'conformer-enc-v1': ConformerEncoderV1 } + "dc1d": DC1dDecoder, + "dc2d": DC2dDecoder, + "resnet1d": ResNet1dDecoder, + "resnet2d": ResNet2dDecoder, + "transformer-enc-v1": TransformerEncoderV1, + "conformer-enc-v1": ConformerEncoderV1, +} -def init_data(data_rspec, train_list, val_list, - num_workers, num_gpus, rank, **kwargs): +def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_model(rank, **kwargs): - Encoder = kwargs['enc_class'] - Decoder = kwargs['dec_class'] - enc_args = Encoder.filter_args(**kwargs['enc']) - dec_args = Decoder.filter_args(**kwargs['dec']) + Encoder = kwargs["enc_class"] + Decoder = kwargs["dec_class"] + enc_args = Encoder.filter_args(**kwargs["enc"]) + dec_args = Decoder.filter_args(**kwargs["dec"]) vae_args = VAE.filter_args(**kwargs) # add some extra arguments if Encoder in ( - DC1dEncoder, - ResNet1dEncoder, - TransformerEncoderV1, - ConformerEncoderV1): - enc_args['in_feats'] = kwargs['in_feats'] + DC1dEncoder, + ResNet1dEncoder, + TransformerEncoderV1, + ConformerEncoderV1, + ): + enc_args["in_feats"] = kwargs["in_feats"] - if Encoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - enc_args['out_time_dim'] = -1 + if Encoder in (TransformerEncoderV1, ConformerEncoderV1): + enc_args["out_time_dim"] = -1 - if Decoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - dec_args['out_time_dim'] = -1 + if Decoder in 
(TransformerEncoderV1, ConformerEncoderV1): + dec_args["out_time_dim"] = -1 if rank == 0: - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) if rank == 0: - logging.info('vae-model={}'.format(model)) + logging.info("vae-model={}".format(model)) return model @@ -119,21 +126,21 @@ def train_vae(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_model(**kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - trainer = Trainer(model, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -141,8 +148,7 @@ def train_vae(gpu_id, args): ddp.ddp_cleanup() - -# (data_rspec, train_list, val_list, +# (data_rspec, train_list, val_list, # num_gpus, resume, num_workers, **kwargs): # set_float_cpu('float32') @@ -167,9 +173,9 @@ def train_vae(gpu_id, args): # logging.info('trainer args={}'.format(trn_args)) # logging.info('init datasets') -# train_data = SD(data_rspec, train_list, +# train_data = SD(data_rspec, train_list, # return_class=False, **sd_args) -# val_data = SD(data_rspec, val_list, +# val_data = SD(data_rspec, val_list, # return_class=False, is_val=True, **sd_args) # logging.info('init samplers') @@ -200,38 +206,40 @@ def train_vae(gpu_id, args): # trainer.load_last_checkpoint() # trainer.fit(train_loader, test_loader) + def make_parser(enc_class, dec_class): Encoder = enc_dict[enc_class] Decoder = dec_dict[dec_class] parser = ArgumentParser() - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) if Encoder in ( - DC1dEncoder, - ResNet1dEncoder, - TransformerEncoderV1, - ConformerEncoderV1): - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + DC1dEncoder, + ResNet1dEncoder, + TransformerEncoderV1, + ConformerEncoderV1, + ): + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_class_args(parser, prefix='enc') + Encoder.add_class_args(parser, prefix="enc") dec_args = {} - if Decoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - 
dec_args['in_feats'] = True - Decoder.add_class_args(parser, prefix='dec', **dec_args) + if Decoder in (TransformerEncoderV1, ConformerEncoderV1): + dec_args["in_feats"] = True + Decoder.add_class_args(parser, prefix="dec", **dec_args) VAE.add_class_args(parser) Trainer.add_class_args(parser) @@ -239,28 +247,31 @@ def make_parser(enc_class, dec_class): # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == '__main__': +if __name__ == "__main__": - parser = ArgumentParser( - description='Train VAE') + parser = ArgumentParser(description="Train VAE") - parser.add_argument('--local_rank', default=0, type=int) - parser.add_argument('--cfg', action=ActionConfigFile) + parser.add_argument("--local_rank", default=0, type=int) + parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() for ke, ve in enc_dict.items(): for kd, vd in dec_dict.items(): - k = '%s:%s' % (ke, kd) + k = "%s:%s" % (ke, kd) parser_k = make_parser(ke, kd) subcommands.add_subcommand(k, parser_k) @@ -273,20 +284,18 @@ def make_parser(enc_class, dec_class): if gpu_id == 0: try: - config_file = Path(args_sc.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args_sc.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass - ed = vae_type.split(':') + ed = vae_type.split(":") args_sc.enc_class = enc_dict[ed[0]] args_sc.dec_class = dec_dict[ed[1]] # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_vae(gpu_id, args_sc) - - # parser.add_argument('--local_rank', default=0, type=int) # args = parser.parse_args() diff --git a/hyperion/bin/torch-train-vq-dvae.py b/hyperion/bin/torch-train-vq-dvae.py index d4055bff..38bcee0b 100755 --- a/hyperion/bin/torch-train-vq-dvae.py +++ b/hyperion/bin/torch-train-vq-dvae.py @@ -6,10 +6,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -30,86 +35,97 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler enc_dict = { - 'dc1d': DC1dEncoder, - 'dc2d': DC2dEncoder, - 'resnet1d': ResNet1dEncoder, - 'resnet2d': ResNet2dEncoder, - 'transformer-enc-v1': TransformerEncoderV1, - 'conformer-enc-v1': ConformerEncoderV1 } + "dc1d": DC1dEncoder, + "dc2d": DC2dEncoder, + "resnet1d": ResNet1dEncoder, + "resnet2d": ResNet2dEncoder, + "transformer-enc-v1": TransformerEncoderV1, + "conformer-enc-v1": ConformerEncoderV1, +} dec_dict = { - 'dc1d': DC1dDecoder, - 'dc2d': DC2dDecoder, - 'resnet1d': 
ResNet1dDecoder, - 'resnet2d': ResNet2dDecoder, - 'transformer-enc-v1': TransformerEncoderV1, - 'conformer-enc-v1': ConformerEncoderV1 } - - -def init_data(data_rspec, train_list, val_list, - train_pair_list, val_pair_list, - num_workers, num_gpus, rank, **kwargs): + "dc1d": DC1dDecoder, + "dc2d": DC2dDecoder, + "resnet1d": ResNet1dDecoder, + "resnet2d": ResNet2dDecoder, + "transformer-enc-v1": TransformerEncoderV1, + "conformer-enc-v1": ConformerEncoderV1, +} + + +def init_data( + data_rspec, + train_list, + val_list, + train_pair_list, + val_pair_list, + num_workers, + num_gpus, + rank, + **kwargs +): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, train_pair_list, **sd_args) val_data = SD(data_rspec, val_list, val_pair_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_model(rank, **kwargs): - Encoder = kwargs['enc_class'] - Decoder = kwargs['dec_class'] - enc_args = Encoder.filter_args(**kwargs['enc']) - dec_args = Decoder.filter_args(**kwargs['dec']) + Encoder = kwargs["enc_class"] + Decoder = kwargs["dec_class"] + enc_args = Encoder.filter_args(**kwargs["enc"]) + dec_args = Decoder.filter_args(**kwargs["dec"]) vae_args = VAE.filter_args(**kwargs) # add some extra arguments if Encoder in ( - DC1dEncoder, - ResNet1dEncoder, - TransformerEncoderV1, - ConformerEncoderV1): - enc_args['in_feats'] = kwargs['in_feats'] + DC1dEncoder, + ResNet1dEncoder, + TransformerEncoderV1, + ConformerEncoderV1, + ): + enc_args["in_feats"] = kwargs["in_feats"] - if Encoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - enc_args['out_time_dim'] = -1 + if Encoder in (TransformerEncoderV1, ConformerEncoderV1): + enc_args["out_time_dim"] = -1 - if Decoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - dec_args['out_time_dim'] = -1 + if Decoder in (TransformerEncoderV1, ConformerEncoderV1): + dec_args["out_time_dim"] = -1 if rank == 0: - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) if rank == 0: - logging.info('vae-model={}'.format(model)) + logging.info("vae-model={}".format(model)) return model @@ -120,21 +136,21 @@ def train_vae(gpu_id, args): 
kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_model(**kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - trainer = Trainer(model, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -142,8 +158,7 @@ def train_vae(gpu_id, args): ddp.ddp_cleanup() - -# (data_rspec, train_list, val_list, +# (data_rspec, train_list, val_list, # num_gpus, resume, num_workers, **kwargs): # set_float_cpu('float32') @@ -168,9 +183,9 @@ def train_vae(gpu_id, args): # logging.info('trainer args={}'.format(trn_args)) # logging.info('init datasets') -# train_data = SD(data_rspec, train_list, +# train_data = SD(data_rspec, train_list, # return_class=False, **sd_args) -# val_data = SD(data_rspec, val_list, +# val_data = SD(data_rspec, val_list, # return_class=False, is_val=True, **sd_args) # logging.info('init samplers') @@ -201,40 +216,42 @@ def train_vae(gpu_id, args): # trainer.load_last_checkpoint() # trainer.fit(train_loader, test_loader) + def make_parser(enc_class, dec_class): Encoder = enc_dict[enc_class] Decoder = dec_dict[dec_class] parser = ArgumentParser() - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) - parser.add_argument('--train-pair-list', required=True) - parser.add_argument('--val-pair-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) + parser.add_argument("--train-pair-list", required=True) + parser.add_argument("--val-pair-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) if Encoder in ( - DC1dEncoder, - ResNet1dEncoder, - TransformerEncoderV1, - ConformerEncoderV1): - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + DC1dEncoder, + ResNet1dEncoder, + TransformerEncoderV1, + ConformerEncoderV1, + ): + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_class_args(parser, prefix='enc') + Encoder.add_class_args(parser, prefix="enc") dec_args = {} - if Decoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - dec_args['in_feats'] = True - Decoder.add_class_args(parser, prefix='dec', **dec_args) + if Decoder in (TransformerEncoderV1, ConformerEncoderV1): + dec_args["in_feats"] = True + Decoder.add_class_args(parser, prefix="dec", **dec_args) VAE.add_class_args(parser) Trainer.add_class_args(parser) @@ -242,28 +259,31 @@ def make_parser(enc_class, dec_class): # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 
it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == '__main__': +if __name__ == "__main__": - parser = ArgumentParser( - description='Train VQ Denoising VAE') + parser = ArgumentParser(description="Train VQ Denoising VAE") - parser.add_argument('--local_rank', default=0, type=int) - parser.add_argument('--cfg', action=ActionConfigFile) + parser.add_argument("--local_rank", default=0, type=int) + parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() for ke, ve in enc_dict.items(): for kd, vd in dec_dict.items(): - k = '%s:%s' % (ke, kd) + k = "%s:%s" % (ke, kd) parser_k = make_parser(ke, kd) subcommands.add_subcommand(k, parser_k) @@ -276,20 +296,18 @@ def make_parser(enc_class, dec_class): if gpu_id == 0: try: - config_file = Path(args_sc.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args_sc.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass - ed = vae_type.split(':') + ed = vae_type.split(":") args_sc.enc_class = enc_dict[ed[0]] args_sc.dec_class = dec_dict[ed[1]] # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_vae(gpu_id, args_sc) - - # parser.add_argument('--local_rank', default=0, type=int) # args = parser.parse_args() diff --git a/hyperion/bin/torch-train-vq-vae.py b/hyperion/bin/torch-train-vq-vae.py index cdf7bb31..307adef2 100755 --- a/hyperion/bin/torch-train-vq-vae.py +++ b/hyperion/bin/torch-train-vq-vae.py @@ -6,10 +6,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -30,85 +35,87 @@ from hyperion.torch.data import ClassWeightedSeqSampler as Sampler enc_dict = { - 'dc1d': DC1dEncoder, - 'dc2d': DC2dEncoder, - 'resnet1d': ResNet1dEncoder, - 'resnet2d': ResNet2dEncoder, - 'transformer-enc-v1': TransformerEncoderV1, - 'conformer-enc-v1': ConformerEncoderV1 } + "dc1d": DC1dEncoder, + "dc2d": DC2dEncoder, + "resnet1d": ResNet1dEncoder, + "resnet2d": ResNet2dEncoder, + "transformer-enc-v1": TransformerEncoderV1, + "conformer-enc-v1": ConformerEncoderV1, +} dec_dict = { - 'dc1d': DC1dDecoder, - 'dc2d': DC2dDecoder, - 'resnet1d': ResNet1dDecoder, - 'resnet2d': ResNet2dDecoder, - 'transformer-enc-v1': TransformerEncoderV1, - 'conformer-enc-v1': ConformerEncoderV1 } + "dc1d": DC1dDecoder, + "dc2d": DC2dDecoder, + "resnet1d": ResNet1dDecoder, + "resnet2d": ResNet2dDecoder, + "transformer-enc-v1": TransformerEncoderV1, + "conformer-enc-v1": ConformerEncoderV1, +} -def init_data(data_rspec, train_list, val_list, - num_workers, num_gpus, rank, **kwargs): 
+def init_data(data_rspec, train_list, val_list, num_workers, num_gpus, rank, **kwargs): sd_args = SD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(sd_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(sd_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = SD(data_rspec, train_list, **sd_args) val_data = SD(data_rspec, val_list, is_val=True, **sd_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_model(rank, **kwargs): - Encoder = kwargs['enc_class'] - Decoder = kwargs['dec_class'] - enc_args = Encoder.filter_args(**kwargs['enc']) - dec_args = Decoder.filter_args(**kwargs['dec']) + Encoder = kwargs["enc_class"] + Decoder = kwargs["dec_class"] + enc_args = Encoder.filter_args(**kwargs["enc"]) + dec_args = Decoder.filter_args(**kwargs["dec"]) vae_args = VAE.filter_args(**kwargs) # add some extra arguments if Encoder in ( - DC1dEncoder, - ResNet1dEncoder, - TransformerEncoderV1, - ConformerEncoderV1): - enc_args['in_feats'] = kwargs['in_feats'] + DC1dEncoder, + ResNet1dEncoder, + TransformerEncoderV1, + ConformerEncoderV1, + ): + enc_args["in_feats"] = kwargs["in_feats"] - if Encoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - enc_args['out_time_dim'] = -1 + if Encoder in (TransformerEncoderV1, ConformerEncoderV1): + enc_args["out_time_dim"] = -1 - if Decoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - dec_args['out_time_dim'] = -1 + if Decoder in (TransformerEncoderV1, ConformerEncoderV1): + dec_args["out_time_dim"] = -1 if rank == 0: - logging.info('encoder args={}'.format(enc_args)) - logging.info('decoder args={}'.format(dec_args)) - logging.info('vae args={}'.format(vae_args)) + logging.info("encoder args={}".format(enc_args)) + logging.info("decoder args={}".format(dec_args)) + logging.info("vae args={}".format(vae_args)) encoder = Encoder(**enc_args) decoder = Decoder(**dec_args) model = VAE(encoder, decoder, **vae_args) if rank == 0: - logging.info('vae-model={}'.format(model)) + logging.info("vae-model={}".format(model)) return model @@ -119,21 +126,21 @@ def train_vae(gpu_id, args): kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) model = init_model(**kwargs) trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'mse': nn.MSELoss(), 'L1': nn.L1Loss() } - trainer = Trainer(model, - device=device, 
metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"mse": nn.MSELoss(), "L1": nn.L1Loss()} + trainer = Trainer( + model, device=device, metrics=metrics, ddp=world_size > 1, **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -141,8 +148,7 @@ def train_vae(gpu_id, args): ddp.ddp_cleanup() - -# (data_rspec, train_list, val_list, +# (data_rspec, train_list, val_list, # num_gpus, resume, num_workers, **kwargs): # set_float_cpu('float32') @@ -167,9 +173,9 @@ def train_vae(gpu_id, args): # logging.info('trainer args={}'.format(trn_args)) # logging.info('init datasets') -# train_data = SD(data_rspec, train_list, +# train_data = SD(data_rspec, train_list, # return_class=False, **sd_args) -# val_data = SD(data_rspec, val_list, +# val_data = SD(data_rspec, val_list, # return_class=False, is_val=True, **sd_args) # logging.info('init samplers') @@ -200,38 +206,40 @@ def train_vae(gpu_id, args): # trainer.load_last_checkpoint() # trainer.fit(train_loader, test_loader) + def make_parser(enc_class, dec_class): Encoder = enc_dict[enc_class] Decoder = dec_dict[dec_class] parser = ArgumentParser() - parser.add_argument('--data-rspec', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--data-rspec", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) SD.add_argparse_args(parser) Sampler.add_argparse_args(parser) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) if Encoder in ( - DC1dEncoder, - ResNet1dEncoder, - TransformerEncoderV1, - ConformerEncoderV1): - parser.add_argument('--in-feats', type=int, required=True, - help='input features dimension') + DC1dEncoder, + ResNet1dEncoder, + TransformerEncoderV1, + ConformerEncoderV1, + ): + parser.add_argument( + "--in-feats", type=int, required=True, help="input features dimension" + ) - Encoder.add_class_args(parser, prefix='enc') + Encoder.add_class_args(parser, prefix="enc") dec_args = {} - if Decoder in ( - TransformerEncoderV1, - ConformerEncoderV1): - dec_args['in_feats'] = True - Decoder.add_class_args(parser, prefix='dec', **dec_args) + if Decoder in (TransformerEncoderV1, ConformerEncoderV1): + dec_args["in_feats"] = True + Decoder.add_class_args(parser, prefix="dec", **dec_args) VAE.add_class_args(parser) @@ -240,28 +248,31 @@ def make_parser(enc_class, dec_class): # parser.add_argument('--num-gpus', type=int, default=1, # help='number of gpus, if 0 it uses cpu') - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) return parser -if __name__ == '__main__': +if __name__ == "__main__": - parser = ArgumentParser( - description='Train VAE') + parser = ArgumentParser(description="Train VAE") - 
parser.add_argument('--local_rank', default=0, type=int) - parser.add_argument('--cfg', action=ActionConfigFile) + parser.add_argument("--local_rank", default=0, type=int) + parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() for ke, ve in enc_dict.items(): for kd, vd in dec_dict.items(): - k = '%s:%s' % (ke, kd) + k = "%s:%s" % (ke, kd) parser_k = make_parser(ke, kd) subcommands.add_subcommand(k, parser_k) @@ -274,20 +285,18 @@ def make_parser(enc_class, dec_class): if gpu_id == 0: try: - config_file = Path(args_sc.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args_sc.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass - ed = vae_type.split(':') + ed = vae_type.split(":") args_sc.enc_class = enc_dict[ed[0]] args_sc.dec_class = dec_dict[ed[1]] # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_vae(gpu_id, args_sc) - - # parser.add_argument('--local_rank', default=0, type=int) # args = parser.parse_args() diff --git a/hyperion/bin/torch-train-xvec-from-wav.py b/hyperion/bin/torch-train-xvec-from-wav.py index 4e1d5856..c95aae5f 100755 --- a/hyperion/bin/torch-train-xvec-from-wav.py +++ b/hyperion/bin/torch-train-xvec-from-wav.py @@ -6,10 +6,15 @@ import sys import os from pathlib import Path -from jsonargparse import ArgumentParser, ActionConfigFile, ActionParser, namespace_to_dict +from jsonargparse import ( + ArgumentParser, + ActionConfigFile, + ActionParser, + namespace_to_dict, +) import time import logging -import multiprocessing +import multiprocessing import numpy as np @@ -29,76 +34,93 @@ from hyperion.torch.models import TDNNXVector as TDXVec from hyperion.torch.models import TransformerXVectorV1 as TFXVec -xvec_dict = {'resnet': RXVec, 'efficientnet': EXVec, 'tdnn': TDXVec, 'transformer': TFXVec} - - -def init_data(audio_path, train_list, val_list, - train_aug_cfg, val_aug_cfg, num_workers, - num_gpus, rank, **kwargs): +xvec_dict = { + "resnet": RXVec, + "efficientnet": EXVec, + "tdnn": TDXVec, + "transformer": TFXVec, +} + + +def init_data( + audio_path, + train_list, + val_list, + train_aug_cfg, + val_aug_cfg, + num_workers, + num_gpus, + rank, + **kwargs +): ad_args = AD.filter_args(**kwargs) sampler_args = Sampler.filter_args(**kwargs) if rank == 0: - logging.info('audio dataset args={}'.format(ad_args)) - logging.info('sampler args={}'.format(sampler_args)) - logging.info('init datasets') + logging.info("audio dataset args={}".format(ad_args)) + logging.info("sampler args={}".format(sampler_args)) + logging.info("init datasets") train_data = AD(audio_path, train_list, aug_cfg=train_aug_cfg, **ad_args) val_data = AD(audio_path, val_list, aug_cfg=val_aug_cfg, is_val=True, **ad_args) if rank == 0: - logging.info('init samplers') + logging.info("init samplers") train_sampler = Sampler(train_data, **sampler_args) val_sampler = Sampler(val_data, **sampler_args) num_workers_per_gpu = int((num_workers + num_gpus - 1) / num_gpus) - largs = {'num_workers': num_workers_per_gpu, 'pin_memory': True} if num_gpus > 0 else {} + largs = ( + {"num_workers": num_workers_per_gpu, "pin_memory": True} if num_gpus > 0 else {} + ) train_loader = torch.utils.data.DataLoader( - train_data, batch_sampler = train_sampler, **largs) + train_data, batch_sampler=train_sampler, **largs + ) test_loader = torch.utils.data.DataLoader( - val_data, 
batch_sampler = val_sampler, **largs) + val_data, batch_sampler=val_sampler, **largs + ) return train_loader, test_loader def init_feats(rank, **kwargs): - feat_args = AF.filter_args(**kwargs['feats']) + feat_args = AF.filter_args(**kwargs["feats"]) if rank == 0: - logging.info('feat args={}'.format(feat_args)) - logging.info('initializing feature extractor') + logging.info("feat args={}".format(feat_args)) + logging.info("initializing feature extractor") feat_extractor = AF(trans=True, **feat_args) if rank == 0: - logging.info('feat-extractor={}'.format(feat_extractor)) + logging.info("feat-extractor={}".format(feat_extractor)) return feat_extractor def init_xvector(num_classes, rank, xvec_class, **kwargs): - + xvec_args = xvec_class.filter_args(**kwargs) if rank == 0: - logging.info('xvector network args={}'.format(xvec_args)) - xvec_args['num_classes'] = num_classes + logging.info("xvector network args={}".format(xvec_args)) + xvec_args["num_classes"] = num_classes model = xvec_class(**xvec_args) if rank == 0: - logging.info('x-vector-model={}'.format(model)) + logging.info("x-vector-model={}".format(model)) return model def train_xvec(gpu_id, args): - + config_logger(args.verbose) del args.verbose logging.debug(args) kwargs = namespace_to_dict(args) torch.manual_seed(args.seed) - set_float_cpu('float32') + set_float_cpu("float32") ddp_args = ddp.filter_ddp_args(**kwargs) device, rank, world_size = ddp.ddp_init(gpu_id, **ddp_args) - kwargs['rank'] = rank + kwargs["rank"] = rank train_loader, test_loader = init_data(**kwargs) feat_extractor = init_feats(**kwargs) @@ -106,11 +128,16 @@ def train_xvec(gpu_id, args): trn_args = Trainer.filter_args(**kwargs) if rank == 0: - logging.info('trainer args={}'.format(trn_args)) - metrics = { 'acc': CategoricalAccuracy() } - trainer = Trainer(model, feat_extractor, - device=device, metrics=metrics, - ddp=world_size>1, **trn_args) + logging.info("trainer args={}".format(trn_args)) + metrics = {"acc": CategoricalAccuracy()} + trainer = Trainer( + model, + feat_extractor, + device=device, + metrics=metrics, + ddp=world_size > 1, + **trn_args + ) if args.resume: trainer.load_last_checkpoint() trainer.fit(train_loader, test_loader) @@ -121,48 +148,52 @@ def train_xvec(gpu_id, args): def make_parser(xvec_class): parser = ArgumentParser() - parser.add_argument('--cfg', action=ActionConfigFile) - parser.add_argument('--audio-path', required=True) - parser.add_argument('--train-list', required=True) - parser.add_argument('--val-list', required=True) + parser.add_argument("--cfg", action=ActionConfigFile) + parser.add_argument("--audio-path", required=True) + parser.add_argument("--train-list", required=True) + parser.add_argument("--val-list", required=True) AD.add_class_args(parser) Sampler.add_class_args(parser) - parser.add_argument('--train-aug-cfg', default=None) - parser.add_argument('--val-aug-cfg', default=None) + parser.add_argument("--train-aug-cfg", default=None) + parser.add_argument("--val-aug-cfg", default=None) - parser.add_argument('--num-workers', type=int, default=5, - help='num_workers of data loader') + parser.add_argument( + "--num-workers", type=int, default=5, help="num_workers of data loader" + ) - AF.add_class_args(parser, prefix='feats') + AF.add_class_args(parser, prefix="feats") xvec_class.add_class_args(parser) Trainer.add_class_args(parser) ddp.add_ddp_args(parser) - parser.add_argument('--seed', type=int, default=1123581321, - help='random seed') - parser.add_argument('--resume', action='store_true', default=False, - 
help='resume training from checkpoint') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - + parser.add_argument("--seed", type=int, default=1123581321, help="random seed") + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + return parser -if __name__ == '__main__': +if __name__ == "__main__": - parser = ArgumentParser( - description='Train XVector from audio files') + parser = ArgumentParser(description="Train XVector from audio files") - parser.add_argument('--local_rank', default=0, type=int) - parser.add_argument('--cfg', action=ActionConfigFile) + parser.add_argument("--local_rank", default=0, type=int) + parser.add_argument("--cfg", action=ActionConfigFile) subcommands = parser.add_subcommands() for k, v in xvec_dict.items(): parser_k = make_parser(v) subcommands.add_subcommand(k, parser_k) - + args = parser.parse_args() gpu_id = args.local_rank del args.local_rank @@ -172,13 +203,12 @@ def make_parser(xvec_class): if gpu_id == 0: try: - config_file = Path(args_sc.exp_path) / 'config.yaml' - parser.save(args, str(config_file), format='yaml', overwrite=True) + config_file = Path(args_sc.exp_path) / "config.yaml" + parser.save(args, str(config_file), format="yaml", overwrite=True) except: pass args_sc.xvec_class = xvec_dict[xvec_type] # torch docs recommend using forkserver - multiprocessing.set_start_method('forkserver') + multiprocessing.set_start_method("forkserver") train_xvec(gpu_id, args_sc) - diff --git a/hyperion/bin/train-cw-up.py b/hyperion/bin/train-cw-up.py index 75cc8855..48b8dfc4 100755 --- a/hyperion/bin/train-cw-up.py +++ b/hyperion/bin/train-cw-up.py @@ -39,12 +39,20 @@ def load_model(input_path, with_lnorm, name, **kwargs): for tf in tfl.transforms: if tf.name == name: return tf - -def train_cw(iv_file, train_list, preproc_file, with_lnorm, - save_tlist, append_tlist, input_path, output_path, **kwargs): - +def train_cw( + iv_file, + train_list, + preproc_file, + with_lnorm, + save_tlist, + append_tlist, + input_path, + output_path, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -58,17 +66,17 @@ def train_cw(iv_file, train_list, preproc_file, with_lnorm, model_args = CentWhitenUP.filter_args(**kwargs) model = load_model(input_path, with_lnorm, **model_args) - + model.fit(x) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) + x = model.predict(x) - x = x[:,:int(x.shape[-1]/2)] - gauss=Normal(x_dim=x.shape[1]) + x = x[:, : int(x.shape[-1] / 2)] + gauss = Normal(x_dim=x.shape[1]) gauss.fit(x=x) logging.debug(gauss.mu[:4]) - logging.debug(gauss.Sigma[:4,:4]) + logging.debug(gauss.Sigma[:4, :4]) if save_tlist: if append_tlist and preproc is not None: @@ -78,39 +86,42 @@ def train_cw(iv_file, train_list, preproc_file, with_lnorm, model = TransformList(model) model.save(output_path) - - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Centering+Whitening') + fromfile_prefix_chars="@", + description="Train Centering+Whitening", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - VR.add_argparse_args(parser) CentWhitenUP.add_argparse_args(parser) - - parser.add_argument('--input-path', dest='input_path', default=None) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--no-lnorm', dest='with_lnorm', - default=True, action='store_false') - - parser.add_argument('--no-save-tlist', dest='save_tlist', - default=True, action='store_false') - parser.add_argument('--no-append-tlist', dest='append_tlist', - default=True, action='store_false') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--input-path", dest="input_path", default=None) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "--no-lnorm", dest="with_lnorm", default=True, action="store_false" + ) + + parser.add_argument( + "--no-save-tlist", dest="save_tlist", default=True, action="store_false" + ) + parser.add_argument( + "--no-append-tlist", dest="append_tlist", default=True, action="store_false" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_cw(**vars(args)) - + train_cw(**vars(args)) diff --git a/hyperion/bin/train-cw.py b/hyperion/bin/train-cw.py index 06a7b0b2..c64d4892 100755 --- a/hyperion/bin/train-cw.py +++ b/hyperion/bin/train-cw.py @@ -39,12 +39,20 @@ def load_model(input_path, with_lnorm, name, **kwargs): for tf in tfl.transforms: if tf.name == name: return tf - -def train_cw(iv_file, train_list, preproc_file, with_lnorm, - save_tlist, append_tlist, input_path, output_path, **kwargs): - +def train_cw( + iv_file, + train_list, + preproc_file, + with_lnorm, + save_tlist, + append_tlist, + input_path, + output_path, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -58,17 +66,17 @@ def train_cw(iv_file, train_list, preproc_file, with_lnorm, model_args = CentWhiten.filter_args(**kwargs) model = load_model(input_path, with_lnorm, **model_args) - + model.fit(x) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) + x = model.predict(x) - gauss=Normal(x_dim=x.shape[1]) + gauss = Normal(x_dim=x.shape[1]) gauss.fit(x=x) logging.debug(gauss.mu[:4]) - logging.debug(gauss.Sigma[:4,:4]) + logging.debug(gauss.Sigma[:4, :4]) if save_tlist: if append_tlist and preproc is not None: @@ -78,39 +86,42 @@ def train_cw(iv_file, train_list, preproc_file, with_lnorm, model = TransformList(model) model.save(output_path) - - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Centering+Whitening') + fromfile_prefix_chars="@", + description="Train Centering+Whitening", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - VR.add_argparse_args(parser) CentWhiten.add_argparse_args(parser) - - parser.add_argument('--input-path', dest='input_path', default=None) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--no-lnorm', dest='with_lnorm', - default=True, action='store_false') - - parser.add_argument('--no-save-tlist', dest='save_tlist', - default=True, action='store_false') - parser.add_argument('--no-append-tlist', dest='append_tlist', - default=True, action='store_false') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--input-path", dest="input_path", default=None) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "--no-lnorm", dest="with_lnorm", default=True, action="store_false" + ) + + parser.add_argument( + "--no-save-tlist", dest="save_tlist", default=True, action="store_false" + ) + parser.add_argument( + "--no-append-tlist", dest="append_tlist", default=True, action="store_false" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_cw(**vars(args)) - + train_cw(**vars(args)) diff --git a/hyperion/bin/train-gaussianizer.py b/hyperion/bin/train-gaussianizer.py index 4dfa478f..eefd2456 100755 --- a/hyperion/bin/train-gaussianizer.py +++ b/hyperion/bin/train-gaussianizer.py @@ -33,12 +33,19 @@ def load_model(input_path, **kwargs): for tf in tfl.transforms: if tf.name == name: return tf - -def train_gauss(iv_file, train_list, preproc_file, - save_tlist, append_tlist, input_path, output_path, **kwargs): - +def train_gauss( + iv_file, + train_list, + preproc_file, + save_tlist, + append_tlist, + input_path, + output_path, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -52,7 +59,7 @@ def train_gauss(iv_file, train_list, preproc_file, model_args = Gaussianizer.filter_args(**kwargs) model = load_model(input_path, **model_args) - + model.fit(x) if save_tlist: @@ -64,58 +71,58 @@ def train_gauss(iv_file, train_list, preproc_file, model.save(output_path) - # import matplotlib # matplotlib.use('Agg') # import matplotlib.pyplot as plt # fig_file = '%s.D%04d1.pdf' % 
(output_path, 0) - + # plt.hist(y[:,0], 300) # plt.grid(True) # plt.show() # plt.savefig(fig_file) # plt.clf() - # fig_file = '%s.D%04d2.pdf' % (output_path, 0) - + # plt.hist(y2[:,0], 300) # plt.grid(True) # plt.show() # plt.savefig(fig_file) # plt.clf() - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains a Gaussianizer') + fromfile_prefix_chars="@", + description="Trains a Gaussianizer", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - VR.add_argparse_args(parser) Gaussianizer.add_argparse_args(parser) - - parser.add_argument('--input-path', dest='input_path', default=None) - parser.add_argument('--output-path', dest='output_path', required=True) - - parser.add_argument('--no-save-tlist', dest='save_tlist', - default=True, action='store_false') - parser.add_argument('--no-append-tlist', dest='append_tlist', - default=True, action='store_false') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--input-path", dest="input_path", default=None) + parser.add_argument("--output-path", dest="output_path", required=True) + + parser.add_argument( + "--no-save-tlist", dest="save_tlist", default=True, action="store_false" + ) + parser.add_argument( + "--no-append-tlist", dest="append_tlist", default=True, action="store_false" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_gauss(**vars(args)) - + train_gauss(**vars(args)) diff --git a/hyperion/bin/train-lda.py b/hyperion/bin/train-lda.py index a8972d83..17cd5ab6 100755 --- a/hyperion/bin/train-lda.py +++ b/hyperion/bin/train-lda.py @@ -19,17 +19,25 @@ from hyperion.transforms import TransformList, LDA, SbSw -def train_lda(iv_file, train_list, preproc_file, lda_dim, - name, save_tlist, append_tlist, output_path, **kwargs): +def train_lda( + iv_file, + train_list, + preproc_file, + lda_dim, + name, + save_tlist, + append_tlist, + output_path, + **kwargs +): - if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None vcr_args = VCR.filter_args(**kwargs) - vcr = VCR(iv_file, train_list, preproc, **vcr_args) + vcr = VCR(iv_file, train_list, preproc, **vcr_args) x, class_ids = vcr.read() t1 = time.time() @@ -37,15 +45,15 @@ def train_lda(iv_file, train_list, preproc_file, lda_dim, model = LDA(lda_dim=lda_dim, name=name) model.fit(x, class_ids) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) + x = model.predict(x) s_mat = SbSw() s_mat.fit(x, class_ids) - logging.debug(s_mat.Sb[:4,:4]) - logging.debug(s_mat.Sw[:4,:4]) - + logging.debug(s_mat.Sb[:4, :4]) + logging.debug(s_mat.Sw[:4, :4]) + if save_tlist: if append_tlist and preproc is not None: preproc.append(model) @@ -54,35 +62,34 @@ def train_lda(iv_file, train_list, preproc_file, lda_dim, model = TransformList(model) model.save(output_path) - - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train LDA') + fromfile_prefix_chars="@", + description="Train LDA", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VCR.add_argparse_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--lda-dim', dest='lda_dim', type=int, - default=None) - parser.add_argument('--no-save-tlist', dest='save_tlist', - default=True, action='store_false') - parser.add_argument('--no-append-tlist', dest='append_tlist', - default=True, action='store_false') - parser.add_argument('--name', dest='name', default='lda') - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--lda-dim", dest="lda_dim", type=int, default=None) + parser.add_argument( + "--no-save-tlist", dest="save_tlist", default=True, action="store_false" + ) + parser.add_argument( + "--no-append-tlist", dest="append_tlist", default=True, action="store_false" + ) + parser.add_argument("--name", dest="name", default="lda") + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) train_lda(**vars(args)) - - diff --git a/hyperion/bin/train-linear-gbe-up.py b/hyperion/bin/train-linear-gbe-up.py index 6e01f8a0..3e102b1f 100755 --- a/hyperion/bin/train-linear-gbe-up.py +++ b/hyperion/bin/train-linear-gbe-up.py @@ -21,9 +21,8 @@ from hyperion.classifiers import LinearGBEUP as GBE -def train_linear_gbe(iv_file, train_list, preproc_file, - output_path, **kwargs): - +def train_linear_gbe(iv_file, train_list, preproc_file, output_path, **kwargs): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -38,35 +37,34 @@ def train_linear_gbe(iv_file, train_list, preproc_file, model_args = GBE.filter_train_args(**kwargs) model = GBE(**model_args) model.fit(x, class_ids) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) model.save(output_path) - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train linear Gaussian back-end with uncertainty propagation') + fromfile_prefix_chars="@", + description="Train linear Gaussian back-end with uncertainty propagation", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VCR.add_argparse_args(parser) GBE.add_argparse_train_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, - choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_linear_gbe(**vars(args)) - + train_linear_gbe(**vars(args)) diff --git a/hyperion/bin/train-linear-gbe.py b/hyperion/bin/train-linear-gbe.py index 08e6414f..1428358e 100755 --- a/hyperion/bin/train-linear-gbe.py +++ b/hyperion/bin/train-linear-gbe.py @@ -21,10 +21,8 @@ from hyperion.classifiers import LinearGBE as GBE -def train_linear_gbe(iv_file, train_list, preproc_file, - output_path, - **kwargs): - +def train_linear_gbe(iv_file, train_list, preproc_file, output_path, **kwargs): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -39,34 +37,34 @@ def train_linear_gbe(iv_file, train_list, preproc_file, model_args = GBE.filter_train_args(**kwargs) model = GBE(**model_args) model.fit(x, class_ids) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) model.save(output_path) - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train linear GBE') + fromfile_prefix_chars="@", + description="Train linear GBE", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VCR.add_argparse_args(parser) GBE.add_argparse_train_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_linear_gbe(**vars(args)) - + train_linear_gbe(**vars(args)) diff --git a/hyperion/bin/train-linear-svmc.py b/hyperion/bin/train-linear-svmc.py index a38a558a..6c0e2de2 100755 --- a/hyperion/bin/train-linear-svmc.py +++ b/hyperion/bin/train-linear-svmc.py @@ -21,9 +21,8 @@ from hyperion.classifiers import LinearSVMC as SVM -def train_svm(iv_file, train_list, preproc_file, - output_path, **kwargs): - +def train_svm(iv_file, train_list, preproc_file, output_path, **kwargs): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -38,34 +37,34 @@ def train_svm(iv_file, train_list, preproc_file, model_args = SVM.filter_train_args(**kwargs) model = SVM(**model_args) model.fit(x, class_ids) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) model.save(output_path) - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train linear SVM classifier') + fromfile_prefix_chars="@", + description="Train linear SVM classifier", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VCR.add_argparse_args(parser) SVM.add_argparse_train_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) train_svm(**vars(args)) - - diff --git a/hyperion/bin/train-logistic-regression.py b/hyperion/bin/train-logistic-regression.py index b61200b4..6a409119 100755 --- a/hyperion/bin/train-logistic-regression.py +++ b/hyperion/bin/train-logistic-regression.py @@ -21,9 +21,8 @@ from hyperion.classifiers import LogisticRegression as LR -def train_lr(iv_file, train_list, preproc_file, - output_path, **kwargs): - +def train_lr(iv_file, train_list, preproc_file, output_path, **kwargs): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -38,34 +37,34 @@ def train_lr(iv_file, train_list, preproc_file, model_args = LR.filter_train_args(**kwargs) model = LR(**model_args) model.fit(x, class_ids) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) model.save(output_path) - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train linear logistic regression classifier') + fromfile_prefix_chars="@", + description="Train linear logistic regression classifier", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VCR.add_argparse_args(parser) LR.add_argparse_train_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_lr(**vars(args)) - + train_lr(**vars(args)) diff --git a/hyperion/bin/train-mvn.py b/hyperion/bin/train-mvn.py index 22229bd0..8ddc5e92 100755 --- a/hyperion/bin/train-mvn.py +++ b/hyperion/bin/train-mvn.py @@ -21,9 +21,17 @@ from hyperion.transforms import TransformList, MVN, SbSw -def train_mvn(iv_file, train_list, preproc_file, - name, save_tlist, append_tlist, output_path, **kwargs): - +def train_mvn( + iv_file, + train_list, + preproc_file, + name, + save_tlist, + append_tlist, + output_path, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -38,8 +46,8 @@ def train_mvn(iv_file, train_list, preproc_file, model = MVN(name=name) model.fit(x) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) + if save_tlist: if append_tlist and preproc is not None: preproc.append(model) @@ -48,34 +56,36 @@ def train_mvn(iv_file, train_list, preproc_file, model = TransformList(model) model.save(output_path) - - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train Global Mean and Variance Normalization') + fromfile_prefix_chars="@", + description="Train Global Mean and Variance Normalization", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VR.add_argparse_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--no-save-tlist', dest='save_tlist', - default=True, action='store_false') - parser.add_argument('--no-append-tlist', dest='append_tlist', - default=True, action='store_false') - parser.add_argument('--name', dest='name', default='mvn') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "--no-save-tlist", dest="save_tlist", default=True, action="store_false" + ) + parser.add_argument( + "--no-append-tlist", dest="append_tlist", default=True, action="store_false" + ) + parser.add_argument("--name", dest="name", default="mvn") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_mvn(**vars(args)) - + train_mvn(**vars(args)) diff --git a/hyperion/bin/train-nda.py b/hyperion/bin/train-nda.py index 2ec88b13..dcc856ed 100755 --- a/hyperion/bin/train-nda.py +++ b/hyperion/bin/train-nda.py @@ -20,18 +20,27 @@ from hyperion.transforms import TransformList, NDA, NSbSw -def train_nda(iv_file, train_list, preproc_file, - nda_dim, K, alpha, - name, save_tlist, append_tlist, output_path, **kwargs): +def train_nda( + iv_file, + train_list, + preproc_file, + nda_dim, + K, + alpha, + name, + save_tlist, + append_tlist, + output_path, + **kwargs +): - if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None vr_args = VCR.filter_args(**kwargs) - vcr = VCR(iv_file, train_list, preproc, **vr_args) + vcr = VCR(iv_file, train_list, preproc, **vr_args) x, class_ids = vcr.read() t1 = time.time() @@ -42,15 +51,15 @@ def train_nda(iv_file, train_list, preproc_file, model = NDA(name=name) model.fit(mu=s_mat.mu, Sb=s_mat.Sb, Sw=s_mat.Sw, nda_dim=nda_dim) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("Elapsed time: %.2f s." 
% (time.time() - t1)) + x = model.predict(x) s_mat = NSbSw() s_mat.fit(x, class_ids) - logging.debug(s_mat.Sb[:4,:4]) - logging.debug(s_mat.Sw[:4,:4]) - + logging.debug(s_mat.Sb[:4, :4]) + logging.debug(s_mat.Sw[:4, :4]) + if save_tlist: if append_tlist and preproc is not None: preproc.append(model) @@ -59,43 +68,42 @@ def train_nda(iv_file, train_list, preproc_file, model = TransformList(model) model.save(output_path) - - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train NDA') + fromfile_prefix_chars="@", + description="Train NDA", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VCR.add_argparse_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--nda-dim', dest='nda_dim', type=int, - default=None) - parser.add_argument('--k', dest='K', type=int, - default=10) - parser.add_argument('--alpha', dest='alpha', type=float, - default=1) - - parser.add_argument('--no-save-tlist', dest='save_tlist', - default=True, action='store_false') - parser.add_argument('--no-append-tlist', dest='append_tlist', - default=True, action='store_false') - parser.add_argument('--name', dest='name', default='nda') - parser.add_argument('--vector-score-file', dest='vector_score_file', default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--nda-dim", dest="nda_dim", type=int, default=None) + parser.add_argument("--k", dest="K", type=int, default=10) + parser.add_argument("--alpha", dest="alpha", type=float, default=1) + + parser.add_argument( + "--no-save-tlist", dest="save_tlist", default=True, action="store_false" + ) + parser.add_argument( + "--no-append-tlist", dest="append_tlist", default=True, action="store_false" + ) + parser.add_argument("--name", dest="name", default="nda") + parser.add_argument("--vector-score-file", dest="vector_score_file", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_nda(**vars(args)) - + train_nda(**vars(args)) diff --git a/hyperion/bin/train-pca.py b/hyperion/bin/train-pca.py index a455eec0..b82a7772 100755 --- a/hyperion/bin/train-pca.py +++ b/hyperion/bin/train-pca.py @@ -33,9 +33,17 @@ def load_model(input_path, name, **kwargs): return tf -def train_pca(iv_file, train_list, preproc_file, - save_tlist, append_tlist, input_path, output_path, **kwargs): - +def train_pca( + iv_file, + train_list, + preproc_file, + save_tlist, + append_tlist, + input_path, + output_path, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -49,11 +57,11 @@ def train_pca(iv_file, train_list, preproc_file, model_args = PCA.filter_args(**kwargs) model = load_model(input_path, **model_args) - + model.fit(x) - 
logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) - + logging.info("Elapsed time: %.2f s." % (time.time() - t1)) + if save_tlist: if append_tlist and preproc is not None: preproc.append(model) @@ -62,37 +70,39 @@ def train_pca(iv_file, train_list, preproc_file, model = TransformList(model) model.save(output_path) - - - + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train PCA') + fromfile_prefix_chars="@", + description="Train PCA", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - VR.add_argparse_args(parser) PCA.add_argparse_args(parser) - - parser.add_argument('--input-path', dest='input_path', default=None) - parser.add_argument('--output-path', dest='output_path', required=True) - - parser.add_argument('--no-save-tlist', dest='save_tlist', - default=True, action='store_false') - parser.add_argument('--no-append-tlist', dest='append_tlist', - default=True, action='store_false') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + + parser.add_argument("--input-path", dest="input_path", default=None) + parser.add_argument("--output-path", dest="output_path", required=True) + + parser.add_argument( + "--no-save-tlist", dest="save_tlist", default=True, action="store_false" + ) + parser.add_argument( + "--no-append-tlist", dest="append_tlist", default=True, action="store_false" + ) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_pca(**vars(args)) - + train_pca(**vars(args)) diff --git a/hyperion/bin/train-plda.py b/hyperion/bin/train-plda.py index 98110663..ba9a40c2 100755 --- a/hyperion/bin/train-plda.py +++ b/hyperion/bin/train-plda.py @@ -21,10 +21,18 @@ from hyperion.transforms import TransformList -def train_plda(iv_file, train_list, val_list, preproc_file, - epochs, ml_md, md_epochs, - output_path, **kwargs): - +def train_plda( + iv_file, + train_list, + val_list, + preproc_file, + epochs, + ml_md, + md_epochs, + output_path, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -39,48 +47,56 @@ def train_plda(iv_file, train_list, val_list, preproc_file, if val_list is not None: vcr_val = VCR(iv_file, val_list, preproc, **vcr_args) x_val, class_ids_val = vcr_val.read() - + t1 = time.time() plda_args = F.filter_train_args(**kwargs) model = F.create_plda(**plda_args) - elbos = model.fit(x, class_ids, x_val=x_val, class_ids_val=class_ids_val, - epochs=epochs, ml_md=ml_md, md_epochs=md_epochs) + elbos = model.fit( + x, + class_ids, + x_val=x_val, + class_ids_val=class_ids_val, + epochs=epochs, + ml_md=ml_md, + md_epochs=md_epochs, + ) + + logging.info("Elapsed time: %.2f s." % (time.time() - t1)) - logging.info('Elapsed time: %.2f s.' 
% (time.time()-t1)) - model.save(output_path) elbo = np.vstack(elbos) num = np.arange(epochs) elbo = np.vstack((num, elbo)).T - elbo_path=os.path.splitext(output_path)[0] + '.csv' - np.savetxt(elbo_path, elbo, delimiter=',') - - + elbo_path = os.path.splitext(output_path)[0] + ".csv" + np.savetxt(elbo_path, elbo, delimiter=",") + + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train PLDA') + fromfile_prefix_chars="@", + description="Train PLDA", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--val-list', dest='val_list', default=None) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", default=None) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VCR.add_argparse_args(parser) F.add_argparse_train_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_plda(**vars(args)) - + train_plda(**vars(args)) diff --git a/hyperion/bin_deprec/ark2hyp.py b/hyperion/bin_deprec/ark2hyp.py index 7ddb5b1d..45a20712 100755 --- a/hyperion/bin_deprec/ark2hyp.py +++ b/hyperion/bin_deprec/ark2hyp.py @@ -17,38 +17,41 @@ from hyperion.hyp_defs import config_logger from hyperion.io import HypDataWriter, KaldiDataReader + def ark2hyp(input_file, input_dir, output_file, field, chunk_size, squeeze): output_dir = os.path.dirname(output_file) if not os.path.exists(output_dir): os.makedirs(output_dir) - + ark_r = KaldiDataReader(input_file, input_dir) h_w = HypDataWriter(output_file) - - while not(ark_r.eof()): + + while not (ark_r.eof()): X, keys = ark_r.read(num_records=chunk_size, squeeze=squeeze) h_w.write(keys, field, X) if __name__ == "__main__": - - parser=argparse.ArgumentParser( + + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Compacts .arr files into a hdf5 file.') - - parser.add_argument('--input-file',dest='input_file', required=True) - parser.add_argument('--input-dir', dest='input_dir', default=None) - parser.add_argument('--output-file', dest='output_file', required=True) - parser.add_argument('--field', dest='field', default='') - parser.add_argument('--chunk-size', dest='chunk_size', type=int, default=None) - parser.add_argument('--squeeze', dest='squeeze', default=False, action='store_true') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Compacts .arr files into a hdf5 file.", + ) + + parser.add_argument("--input-file", dest="input_file", required=True) + parser.add_argument("--input-dir", dest="input_dir", default=None) + 
parser.add_argument("--output-file", dest="output_file", required=True) + parser.add_argument("--field", dest="field", default="") + parser.add_argument("--chunk-size", dest="chunk_size", type=int, default=None) + parser.add_argument("--squeeze", dest="squeeze", default=False, action="store_true") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose - + ark2hyp(**vars(args)) - diff --git a/hyperion/bin_deprec/arkvad2nist.py b/hyperion/bin_deprec/arkvad2nist.py index f69b8f27..bd15592a 100755 --- a/hyperion/bin_deprec/arkvad2nist.py +++ b/hyperion/bin_deprec/arkvad2nist.py @@ -18,10 +18,11 @@ from hyperion.io import KaldiDataReader + def bin2intervals(vad): delta = np.abs(np.diff(vad)) - change_points = np.where(delta>0)[0] - num_interv = len(change_points)+1 + change_points = np.where(delta > 0)[0] + num_interv = len(change_points) + 1 if vad[0] == 1: speech = True else: @@ -55,41 +56,50 @@ def write_opensat(file_vad, key, vad): file_dir = os.path.dirname(file_vad) if not os.path.exists(file_dir): os.makedirs(file_dir) - with open(file_vad, 'w') as f: + with open(file_vad, "w") as f: start, stop, state, conf = bin2intervals(vad) for i in range(len(start)): - f.write('X\tX\tX\tSAD\t%s\t%.2f\t%.2f\t%s\t%.2f\n' % ( - key, start[i], stop[i], - 'speech' if state[i] else 'non-speech', conf[i])) - + f.write( + "X\tX\tX\tSAD\t%s\t%.2f\t%.2f\t%s\t%.2f\n" + % ( + key, + start[i], + stop[i], + "speech" if state[i] else "non-speech", + conf[i], + ) + ) + def arkvad2nist(input_file, input_dir, output_dir): ark_r = KaldiDataReader(input_file, input_dir) - while not(ark_r.eof()): + while not (ark_r.eof()): X, keys = ark_r.read(num_records=1) - #print(X) - #print(keys) - file_vad = output_dir + '/' + keys[0] + '.txt' + # print(X) + # print(keys) + file_vad = output_dir + "/" + keys[0] + ".txt" write_opensat(file_vad, keys[0], X[0]) if __name__ == "__main__": - - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Converts from Kaldi VAD ark file to NIST OpenSAT format') - parser.add_argument('--input-file',dest='input_file', required=True) - parser.add_argument('--input-dir', dest='input_dir', default=None) - parser.add_argument('--output-dir', dest='output_dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Converts from Kaldi VAD ark file to NIST OpenSAT format", + ) + + parser.add_argument("--input-file", dest="input_file", required=True) + parser.add_argument("--input-dir", dest="input_dir", default=None) + parser.add_argument("--output-dir", dest="output_dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose arkvad2nist(**vars(args)) - diff --git a/hyperion/bin_deprec/compute-gmm-post.py b/hyperion/bin_deprec/compute-gmm-post.py index 393c78e7..1b0a8d04 100755 --- a/hyperion/bin_deprec/compute-gmm-post.py +++ b/hyperion/bin_deprec/compute-gmm-post.py @@ -25,11 +25,11 @@ def to_sparse(r, num_comp): - index = np.argsort(r, axis=1)[:,-num_comp:] + index = np.argsort(r, axis=1)[:, -num_comp:] r_sparse = 
np.zeros((r.shape[0], num_comp), dtype=float_cpu()) for i, index_i in enumerate(index): r_sparse[i] = r[i, index_i] - r_sparse = r_sparse/np.sum(r_sparse, axis=-1, keepdims=True) + r_sparse = r_sparse / np.sum(r_sparse, axis=-1, keepdims=True) return r_sparse, index @@ -41,69 +41,77 @@ def to_dense(r_sparse, index, num_comp): return r -def compute_gmm_post(seq_file, file_list, model_file, preproc_file, output_path, - num_comp, **kwargs): +def compute_gmm_post( + seq_file, file_list, model_file, preproc_file, output_path, num_comp, **kwargs +): - sr_args = SR.filter_eval_args(**kwargs) - + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None gmm = DiagGMM.load_from_kaldi(model_file) - - sr = SR(seq_file, file_list, batch_size=1, - shuffle_seqs=False, - preproc=preproc, **sr_args) - + + sr = SR( + seq_file, + file_list, + batch_size=1, + shuffle_seqs=False, + preproc=preproc, + **sr_args + ) + t1 = time.time() - - logging.info(time.time()-t1) + + logging.info(time.time() - t1) index = np.zeros((sr.num_seqs, num_comp), dtype=int) hw = HypDataWriter(output_path) for i in range(sr.num_seqs): x, key = sr.read_next_seq() - logging.info('Extracting i-vector %d/%d for %s, num_frames: %d' % (i, sr.num_seqs, key, x.shape[0])) + logging.info( + "Extracting i-vector %d/%d for %s, num_frames: %d" + % (i, sr.num_seqs, key, x.shape[0]) + ) r = gmm.compute_z(x) r_s, index = to_sparse(r, num_comp) - if i==0: + if i == 0: r2 = to_dense(r_s, index, r.shape[1]) - logging.degug(np.sort(r[0,:])[-12:]) - logging.degug(np.sort(r2[0,:])[-12:]) - logging.degug(np.argsort(r[0,:])[-12:]) - logging.degug(np.argsort(r2[0,:])[-12:]) - - hw.write([key], '.r', [r_s]) - hw.write([key], '.index', [index]) - - logging.info('Extract elapsed time: %.2f' % (time.time() - t1)) - - + logging.degug(np.sort(r[0, :])[-12:]) + logging.degug(np.sort(r2[0, :])[-12:]) + logging.degug(np.argsort(r[0, :])[-12:]) + logging.degug(np.argsort(r2[0, :])[-12:]) + + hw.write([key], ".r", [r_s]) + hw.write([key], ".index", [index]) + + logging.info("Extract elapsed time: %.2f" % (time.time() - t1)) + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Extract TVAE i-vectors') + fromfile_prefix_chars="@", + description="Extract TVAE i-vectors", + ) - parser.add_argument('--seq-file', dest='seq_file', required=True) - parser.add_argument('--file-list', dest='file_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('--num-comp', dest='num_comp', default=10) + parser.add_argument("--seq-file", dest="seq_file", required=True) + parser.add_argument("--file-list", dest="file_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument("--num-comp", dest="num_comp", default=10) SR.add_argparse_eval_args(parser) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = 
parser.parse_args() config_logger(args.verbose) del args.verbose - - compute_gmm_post(**vars(args)) - + compute_gmm_post(**vars(args)) diff --git a/hyperion/bin_deprec/eval-2class-performance.py b/hyperion/bin_deprec/eval-2class-performance.py index 8cc9890d..a10ec5c0 100755 --- a/hyperion/bin_deprec/eval-2class-performance.py +++ b/hyperion/bin_deprec/eval-2class-performance.py @@ -20,39 +20,42 @@ from hyperion.utils.trial_key import TrialKey from hyperion.metrics import compute_eer + def eval_2class_performance(score_file, key_file, output_path): scr = TrialScores.load(score_file) key = TrialKey.load(key_file) output_dir = os.path.dirname(output_path) - if not(os.path.isdir(output_dir)): + if not (os.path.isdir(output_dir)): os.makedirs(output_dir, exist_ok=True) - + tar, non = scr.get_tar_non(key) eer = compute_eer(tar, non) - output_file=output_path + '.res' - with open(output_file, 'w') as f: - f.write('EER %.4f\nNTAR %d\nNNON %d\n' - % (eer, len(tar), len(non))) - - + output_file = output_path + ".res" + with open(output_file, "w") as f: + f.write("EER %.4f\nNTAR %d\nNNON %d\n" % (eer, len(tar), len(non))) + + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals EER, DCF, DET') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Evals EER, DCF, DET", + ) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--key-file', dest='key_file', required=True) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--key-file", dest="key_file", required=True) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + eval_2class_performance(**vars(args)) diff --git a/hyperion/bin_deprec/eval-elbo-ubm.py b/hyperion/bin_deprec/eval-elbo-ubm.py index 44df4c17..5cf1aa0d 100755 --- a/hyperion/bin_deprec/eval-elbo-ubm.py +++ b/hyperion/bin_deprec/eval-elbo-ubm.py @@ -21,29 +21,34 @@ from hyperion.pdfs import DiagGMM - -def eval_elbo(seq_file, file_list, model_file, preproc_file, - output_file, ubm_type, **kwargs): +def eval_elbo( + seq_file, file_list, model_file, preproc_file, output_file, ubm_type, **kwargs +): sr_args = SR.filter_eval_args(**kwargs) - + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: preproc = None - sr = SR(seq_file, file_list, batch_size=1, - shuffle_seqs=False, - preproc=preproc, **sr_args) - + sr = SR( + seq_file, + file_list, + batch_size=1, + shuffle_seqs=False, + preproc=preproc, + **sr_args + ) + t1 = time.time() - if ubm_type == 'diag-gmm': + if ubm_type == "diag-gmm": model = DiagGMM.load(model_file) else: model = DiagGMM.load_from_kaldi(model_file) model.initialize() - + elbo = np.zeros((sr.num_seqs,), dtype=float_cpu()) num_frames = np.zeros((sr.num_seqs,), dtype=int) keys = [] @@ -55,37 +60,42 @@ def eval_elbo(seq_file, file_list, model_file, preproc_file, num_total_frames = np.sum(num_frames) total_elbo = np.sum(elbo) - total_elbo_norm = total_elbo/num_total_frames - 
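The quantities reported by eval-elbo-ubm.py at this point are simply the summed per-utterance ELBO and its per-frame average. A tiny numeric illustration; the values are made up, and how each utterance's ELBO is obtained is elided in the hunk above:

import numpy as np

elbo = np.array([-1234.5, -987.6])        # per-utterance ELBO values (made up)
num_frames = np.array([300, 250])         # frames per utterance (made up)
total_elbo = elbo.sum()
total_elbo_norm = total_elbo / num_frames.sum()   # average ELBO per frame
print("Total ELBO: %f\nELBO_NORM %f" % (total_elbo, total_elbo_norm))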
logging.info('Extract elapsed time: %.2f' % (time.time() - t1)) - s = 'Total ELBO: %f\nELBO_NORM %f' % (total_elbo, total_elbo_norm) + total_elbo_norm = total_elbo / num_total_frames + logging.info("Extract elapsed time: %.2f" % (time.time() - t1)) + s = "Total ELBO: %f\nELBO_NORM %f" % (total_elbo, total_elbo_norm) logging.info(s) - with open(output_file,'w') as f: + with open(output_file, "w") as f: f.write(s) - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evaluate UBM ELBO') - - parser.add_argument('--seq-file', dest='seq_file', required=True) - parser.add_argument('--file-list', dest='file_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - parser.add_argument('--model-file', dest='model_file', required=True) - parser.add_argument('--output-file', dest='output_file', required=True) - parser.add_argument('--ubm-type', dest='ubm_type', default='diag-gmm', - choices=['diag-gmm', 'kaldi-diag-gmm']) - + fromfile_prefix_chars="@", + description="Evaluate UBM ELBO", + ) + + parser.add_argument("--seq-file", dest="seq_file", required=True) + parser.add_argument("--file-list", dest="file_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) + parser.add_argument("--model-file", dest="model_file", required=True) + parser.add_argument("--output-file", dest="output_file", required=True) + parser.add_argument( + "--ubm-type", + dest="ubm_type", + default="diag-gmm", + choices=["diag-gmm", "kaldi-diag-gmm"], + ) + SR.add_argparse_eval_args(parser) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) - args=parser.parse_args() + args = parser.parse_args() config_logger(args.verbose) del args.verbose - - eval_elbo(**vars(args)) + eval_elbo(**vars(args)) diff --git a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py index 1c506ec8..9e2880f8 100755 --- a/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py +++ b/hyperion/bin_deprec/eval-q-scoring-homo-gbe.py @@ -24,11 +24,18 @@ from hyperion.classifiers import QScoringHomoGBE as GBE -def eval_qscoring_gbe(iv_file, class2int_file, test_file, - preproc_file, - model_file, score_file, vector_score_file, - normalize, **kwargs): - +def eval_qscoring_gbe( + iv_file, + class2int_file, + test_file, + preproc_file, + model_file, + score_file, + vector_score_file, + normalize, + **kwargs +): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -39,46 +46,49 @@ def eval_qscoring_gbe(iv_file, class2int_file, test_file, x, ndx = tdr.read() model = GBE.load(model_file) - + t1 = time.time() scores = model.predict(x, normalize) - + dt = time.time() - t1 - num_trials = scores.shape[0]*scores.shape[1] - logging.info('Elapsed time: %.2f s. Elapsed time per trial: %.2f ms.' - % (dt, dt/num_trials*1000)) + num_trials = scores.shape[0] * scores.shape[1] + logging.info( + "Elapsed time: %.2f s. Elapsed time per trial: %.2f ms." 
+ % (dt, dt / num_trials * 1000) + ) s = TrialScores(ndx.model_set, ndx.seg_set, scores.T) s.save(score_file) if vector_score_file is not None: h5 = HDW(vector_score_file) - h5.write(ndx.seg_set, '', scores) + h5.write(ndx.seg_set, "", scores) + - if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Evals Q-scoring back-end') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Evals Q-scoring back-end", + ) + + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--class2int-file", dest="class2int_file", required=True) + parser.add_argument("--test-file", dest="test_file", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--class2int-file', dest='class2int_file', required=True) - parser.add_argument('--test-file', dest='test_file', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) - TDR.add_argparse_args(parser) GBE.add_argparse_eval_args(parser) - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--vector-score-file', dest='vector_score_file', default=None) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--vector-score-file", dest="vector_score_file", default=None) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_qscoring_gbe(**vars(args)) - + eval_qscoring_gbe(**vars(args)) diff --git a/hyperion/bin_deprec/eval-score-norm.py b/hyperion/bin_deprec/eval-score-norm.py index f13ca647..fd6e2e00 100755 --- a/hyperion/bin_deprec/eval-score-norm.py +++ b/hyperion/bin_deprec/eval-score-norm.py @@ -21,14 +21,13 @@ from hyperion.utils.trial_ndx import TrialNdx - def load_scores(score_file, enr_coh_file, coh_test_file, coh_coh_file): scores = TrialScores.load(score_file) scores_enr_coh = None scores_coh_test = None scores_coh_coh = None - + if enr_coh_file is not None: ndx = TrialNdx(scores.model_set, scores_enr_coh.seg_set) scores_enr_coh = TrialScores.load(enr_coh_file) @@ -48,69 +47,89 @@ def load_scores(score_file, enr_coh_file, coh_test_file, coh_coh_file): return scores, scores_enr_coh, scores_coh_test, scores_coh_coh - -def score_norm(score_file, ouput_file, norm_type, - enr_coh_file=None, coh_test_file=None, coh_coh_file=None, - adapt_coh=None): +def score_norm( + score_file, + ouput_file, + norm_type, + enr_coh_file=None, + coh_test_file=None, + coh_coh_file=None, + adapt_coh=None, +): scores, scores_enr_coh, scores_coh_test, scores_coh_coh = load_scores( - score_file, enr_coh_file, coh_test_file, coh_coh_file) + score_file, enr_coh_file, coh_test_file, coh_coh_file + ) - if norm_type == 't_norm': + if norm_type == "t_norm": assert scores_coh_test is not None norm = TNorm() scores_norm = norm.predict(scores.scores, scores_coh_test.scores) - if norm_type == 'z_norm': + if norm_type == "z_norm": assert scores_enr_coh is not None norm = ZNorm() scores_norm = norm.predict(scores.scores, scores_enr_coh.scores) - if norm_type == 
'zt_norm': - assert(scores_enr_coh is not None and scores_coh_test is not None - and scores_coh_coh is not None) + if norm_type == "zt_norm": + assert ( + scores_enr_coh is not None + and scores_coh_test is not None + and scores_coh_coh is not None + ) norm = ZTNorm() - scores_norm = norm.predict(scores.scores, - scores_coh_test.scores, - scores_enr_coh.scores, - scores_coh_coh.scores) - - if norm_type == 's_norm': - assert(scores_enr_coh is not None and scores_coh_test is not None - and scores_coh_coh is not None) + scores_norm = norm.predict( + scores.scores, + scores_coh_test.scores, + scores_enr_coh.scores, + scores_coh_coh.scores, + ) + + if norm_type == "s_norm": + assert ( + scores_enr_coh is not None + and scores_coh_test is not None + and scores_coh_coh is not None + ) norm = SNorm() - scores_norm = norm.predict(scores.scores, - scores_coh_test.scores, - scores_enr_coh.scores, - scores_coh_coh.scores) + scores_norm = norm.predict( + scores.scores, + scores_coh_test.scores, + scores_enr_coh.scores, + scores_coh_coh.scores, + ) scores.scores = scores_norm scores.save(ouput_file) - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Eval score normalization') - - parser.add_argument('--score-file', dest='score_file', required=True) - parser.add_argument('--output-file', dest='output_file', required=True) - parser.add_argument('--enr-coh-file', dest='enr_coh_file', default=None) - parser.add_argument('--coh-test-file', dest='coh_test_file', default=None) - parser.add_argument('--coh-coh-file', dest='coh_coh_file', default=None) - parser.add_argument('--norm-type', dest='norm_type', default='s-norm', - choices=['t-norm', 'z-norm', 'zt-norm', 's-norm']) - parser.add_argument('--adapt-coh', dest='adapt_coh', default=None, type=int) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Eval score normalization", + ) + + parser.add_argument("--score-file", dest="score_file", required=True) + parser.add_argument("--output-file", dest="output_file", required=True) + parser.add_argument("--enr-coh-file", dest="enr_coh_file", default=None) + parser.add_argument("--coh-test-file", dest="coh_test_file", default=None) + parser.add_argument("--coh-coh-file", dest="coh_coh_file", default=None) + parser.add_argument( + "--norm-type", + dest="norm_type", + default="s-norm", + choices=["t-norm", "z-norm", "zt-norm", "s-norm"], + ) + parser.add_argument("--adapt-coh", dest="adapt_coh", default=None, type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - eval_score_norm(**vars(args)) - + eval_score_norm(**vars(args)) diff --git a/hyperion/bin_deprec/h5vad2nist.py b/hyperion/bin_deprec/h5vad2nist.py index a1b53af7..804c8637 100755 --- a/hyperion/bin_deprec/h5vad2nist.py +++ b/hyperion/bin_deprec/h5vad2nist.py @@ -18,10 +18,11 @@ from hyperion.hyp_defs import config_logger from hyperion.io import HypDataReader + def bin2intervals(vad): delta = np.abs(np.diff(vad)) - change_points = np.where(delta>0)[0] - num_interv = len(change_points)+1 + change_points = np.where(delta > 0)[0] + num_interv = len(change_points) + 1 if vad[0] 
== 1: speech = True else: @@ -55,38 +56,49 @@ def write_opensat(file_vad, key, vad): file_dir = os.path.dirname(file_vad) if not os.path.exists(file_dir): os.makedirs(file_dir) - with open(file_vad, 'w') as f: + with open(file_vad, "w") as f: start, stop, state, conf = bin2intervals(vad) for i in range(len(start)): - f.write('X\tX\tX\tSAD\t%s\t%.2f\t%.2f\t%s\t%.2f\n' % (key, start[i], stop[i], 'speech' if state[i] else 'non-speech', conf[i])) - + f.write( + "X\tX\tX\tSAD\t%s\t%.2f\t%.2f\t%s\t%.2f\n" + % ( + key, + start[i], + stop[i], + "speech" if state[i] else "non-speech", + conf[i], + ) + ) + def h5vad2nist(input_file, output_dir): r = HypDataReader(input_file) keys = r.get_datasets() - + for key in keys: X = r.read([key]) - file_vad = output_dir + '/' + key + '.txt' + file_vad = output_dir + "/" + key + ".txt" write_opensat(file_vad, key, X[0]) if __name__ == "__main__": - - parser=argparse.ArgumentParser( + + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Converts from VAD hdf5 file to NIST OpenSAT format') - - parser.add_argument('--input-file',dest='input_file', required=True) - parser.add_argument('--output-dir', dest='output_dir', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + fromfile_prefix_chars="@", + description="Converts from VAD hdf5 file to NIST OpenSAT format", + ) + + parser.add_argument("--input-file", dest="input_file", required=True) + parser.add_argument("--output-dir", dest="output_dir", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - + h5vad2nist(**vars(args)) - diff --git a/hyperion/bin_deprec/init-ubm.py b/hyperion/bin_deprec/init-ubm.py index 17ea7622..8a162314 100755 --- a/hyperion/bin_deprec/init-ubm.py +++ b/hyperion/bin_deprec/init-ubm.py @@ -26,57 +26,56 @@ @threadsafe_generator def data_generator(sr, max_length): - kk=0 + kk = 0 while 1: - kk+=1 + kk += 1 x, sample_weights = sr.read(return_3d=True, max_seq_length=max_length) return_sw = True - if sr.max_batch_seq_length==max_length and ( - sr.min_seq_length==sr.max_seq_length or - np.min(sr.seq_length)==sr.max_seq_length): + if sr.max_batch_seq_length == max_length and ( + sr.min_seq_length == sr.max_seq_length + or np.min(sr.seq_length) == sr.max_seq_length + ): return_sw = False - + if return_sw: yield (x, x, sample_weights) else: yield (x, x) - -def init_ubm(seq_file, train_list, x_dim, num_comp, - output_path, **kwargs): + +def init_ubm(seq_file, train_list, x_dim, num_comp, output_path, **kwargs): if seq_file is None: model = DiagGMM(x_dim=x_dim, num_comp=1) model.initialize() model.save(output_path) - sr_args = SR.filter_args(**kwargs) sr = SR(seq_file, train_list, batch_size=1, **sr_args) - if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Initializes UBM') - - parser.add_argument('--seq-file', dest='seq_file', default=None) - parser.add_argument('--train-list', dest='train_list', default=None) - parser.add_argument('--x-dim', dest='x_dim', type=int, required=True) - parser.add_argument('--num-comp', dest='num_comp', default=1) - parser.add_argument('--output-path', dest='output_path', required=True) - 
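init-ubm.py above builds a single-component diagonal GMM (DiagGMM(x_dim=x_dim, num_comp=1)) when no feature file is given. As a generic illustration of what such an initialization amounts to; this is not the internals of hyperion's DiagGMM.initialize(), only the usual global-statistics starting point:

import numpy as np

x = np.random.randn(1000, 40)    # stand-in features: (num_frames, x_dim)
mu = x.mean(axis=0)              # global mean, shape (x_dim,)
sigma2 = x.var(axis=0)           # per-dimension variance (diagonal covariance)
pi = np.array([1.0])             # one component with unit weight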
parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - + fromfile_prefix_chars="@", + description="Initializes UBM", + ) + + parser.add_argument("--seq-file", dest="seq_file", default=None) + parser.add_argument("--train-list", dest="train_list", default=None) + parser.add_argument("--x-dim", dest="x_dim", type=int, required=True) + parser.add_argument("--num-comp", dest="num_comp", default=1) + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + SR.add_argparse_args(parser) - - args=parser.parse_args() + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - init_ubm(**vars(args)) - + init_ubm(**vars(args)) diff --git a/hyperion/bin_deprec/scores2lre_format.py b/hyperion/bin_deprec/scores2lre_format.py index 571cf741..50e9147f 100755 --- a/hyperion/bin_deprec/scores2lre_format.py +++ b/hyperion/bin_deprec/scores2lre_format.py @@ -17,6 +17,7 @@ from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores + def convert(input_file, output_file, test_list, class_file, add_ext): scores = TrialScores.load(input_file) @@ -24,9 +25,12 @@ def convert(input_file, output_file, test_list, class_file, add_ext): if test_list is None: seg_set = scores.seg_set else: - with open(test_list, 'r') as f: - seg_set = [ seg for seg in [line.rstrip().split(' ')[0] for line in f] - if seg!='segmentid'] + with open(test_list, "r") as f: + seg_set = [ + seg + for seg in [line.rstrip().split(" ")[0] for line in f] + if seg != "segmentid" + ] if add_ext: exts = [os.path.splitext(seg)[1] for seg in seg_set] seg_set = [os.path.splitext(seg)[0] for seg in seg_set] @@ -34,46 +38,47 @@ def convert(input_file, output_file, test_list, class_file, add_ext): if class_file is None: model_set = scores.model_set else: - with open(class_file, 'r') as f: + with open(class_file, "r") as f: model_set = [line.rstrip().split()[0] for line in f] ndx = TrialNdx(model_set, seg_set) scores = scores.set_missing_to_value(ndx, -100) if add_ext: - scores.seg_set = [seg+ext for seg, ext in zip(scores.seg_set, exts)] - - with open(output_file, 'w') as f: - f.write('segmentid\t') + scores.seg_set = [seg + ext for seg, ext in zip(scores.seg_set, exts)] + + with open(output_file, "w") as f: + f.write("segmentid\t") for model in scores.model_set[:-1]: - f.write('%s\t' % model) - f.write('%s\n' % scores.model_set[-1]) + f.write("%s\t" % model) + f.write("%s\n" % scores.model_set[-1]) for i in range(scores.scores.shape[1]): - f.write('%s\t' % scores.seg_set[i]) - for j in range(scores.scores.shape[0]-1): - f.write('%f\t' % scores.scores[j, i]) - f.write('%f\n' % scores.scores[-1, i]) - - + f.write("%s\t" % scores.seg_set[i]) + for j in range(scores.scores.shape[0] - 1): + f.write("%f\t" % scores.scores[j, i]) + f.write("%f\n" % scores.scores[-1, i]) + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Convert scores to LRE format') - - parser.add_argument('--input-file', dest='input_file', required=True) - parser.add_argument('--output-file', dest='output_file', required=True) - parser.add_argument('--test-list', dest='test_list', default=None) - parser.add_argument('--class-file', dest='class_file', default=None) - parser.add_argument('--add-ext', dest='add_ext', default=False, 
action='store_true') - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Convert scores to LRE format", + ) + + parser.add_argument("--input-file", dest="input_file", required=True) + parser.add_argument("--output-file", dest="output_file", required=True) + parser.add_argument("--test-list", dest="test_list", default=None) + parser.add_argument("--class-file", dest="class_file", default=None) + parser.add_argument("--add-ext", dest="add_ext", default=False, action="store_true") + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - convert(**vars(args)) + convert(**vars(args)) diff --git a/hyperion/bin_deprec/torch-train-xvector.py b/hyperion/bin_deprec/torch-train-xvector.py index 77c5d7b7..4cc443ae 100755 --- a/hyperion/bin_deprec/torch-train-xvector.py +++ b/hyperion/bin_deprec/torch-train-xvector.py @@ -25,34 +25,49 @@ from hyperion.torch.metrics import CategoricalAccuracy -def train_xvector(data_path, train_list, val_list, - encoder_net, classif_net, preproc_net, loss, - exp_path, epochs, resume, resume_path, num_gpus, seed, **kwargs): +def train_xvector( + data_path, + train_list, + val_list, + encoder_net, + classif_net, + preproc_net, + loss, + exp_path, + epochs, + resume, + resume_path, + num_gpus, + seed, + **kwargs +): device = open_device(num_gpus=num_gpus) set_float_cpu(float_torch()) torch.manual_seed(seed) - opt_args = OF.filter_args(prefix='opt', **kwargs) - lrsch_args = LRSF.filter_args(prefix='lrsch', **kwargs) - pool_cfg = PF.filter_args(prefix='pool', **kwargs) - dataset_args = SeqDataset.filter_args(prefix='data', **kwargs) - sampler_args = Sampler.filter_args(prefix='data', **kwargs) + opt_args = OF.filter_args(prefix="opt", **kwargs) + lrsch_args = LRSF.filter_args(prefix="lrsch", **kwargs) + pool_cfg = PF.filter_args(prefix="pool", **kwargs) + dataset_args = SeqDataset.filter_args(prefix="data", **kwargs) + sampler_args = Sampler.filter_args(prefix="data", **kwargs) train_data = SeqDataset(data_path, train_list, **dataset_args) train_sampler = Sampler(train_data, **sampler_args) val_data = SeqDataset(data_path, val_list, **dataset_args) val_sampler = Sampler(val_data, **sampler_args) - train_loader = DataLoader(train_data, batch_sampler=train_sampler, - num_workers=num_workers) - val_loader = DataLoader(val_data, batch_sampler=val_sampler, - num_workers=num_workers) - + train_loader = DataLoader( + train_data, batch_sampler=train_sampler, num_workers=num_workers + ) + val_loader = DataLoader( + val_data, batch_sampler=val_sampler, num_workers=num_workers + ) + optimizer = OF.create(model.parameters(), **opt_args) lr_sch = LRSF.create(optimizer, **lrsch_args) loss = nn.CrossEntropyLoss() - metrics = { 'acc': CategoricalAccuracy() } + metrics = {"acc": CategoricalAccuracy()} if preproc_net is not None: preproc_net = TorchNALoader.load(preproc_net) @@ -60,8 +75,17 @@ def train_xvector(data_path, train_list, val_list, classif_net = TorchNALoader.load(classif_net) model = XVector(encoder_net, pool_cfg, classif_net, preproc_net=preproc_net) - trainer = XVectorTrainer(model, optimizer, loss, epochs, exp_path, device=device, metrics=metrics, lr_scheduler=lr_sch) - + trainer = XVectorTrainer( + model, + optimizer, 
+ loss, + epochs, + exp_path, + device=device, + metrics=metrics, + lr_scheduler=lr_sch, + ) + if resume: if resume_path is not None: trainer.load_checkpoint(resume_path) @@ -70,51 +94,59 @@ def train_xvector(data_path, train_list, val_list, trainer.fit(train_loader, val_loader) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Train x-vectors') - - parser.add_argument('--data-path', dest='data_path', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--val-list', dest='val_list', default=None) - SeqDataset.add_argparse_args(parser, prefix='data') - Sampler.add_argparse_args(parser, prefix='data') - - parser.add_argument('--encoder-net', dest='encoder_net', required=True) - parser.add_argument('--classif-net', dest='classif_net', required=True) - parser.add_argument('--preproc-net', dest='preproc_net', required=True) - PF.add_argparse_args(parser, prefix='pool') - - OF.add_argparse_args(parser, prefix='opt') - LRSF.add_argparse_args(parser, prefix='lrsch') - - parser.add_argument('--num-gpus', action='num_gpus', default=1, - help='number of gpus, if 0 use cpu') - parser.add_argument('--seed', type=int, default=1024, - help='random seed') - - # parser.add_argument('--log-interval', type=int, default=10, + fromfile_prefix_chars="@", + description="Train x-vectors", + ) + + parser.add_argument("--data-path", dest="data_path", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--val-list", dest="val_list", default=None) + SeqDataset.add_argparse_args(parser, prefix="data") + Sampler.add_argparse_args(parser, prefix="data") + + parser.add_argument("--encoder-net", dest="encoder_net", required=True) + parser.add_argument("--classif-net", dest="classif_net", required=True) + parser.add_argument("--preproc-net", dest="preproc_net", required=True) + PF.add_argparse_args(parser, prefix="pool") + + OF.add_argparse_args(parser, prefix="opt") + LRSF.add_argparse_args(parser, prefix="lrsch") + + parser.add_argument( + "--num-gpus", action="num_gpus", default=1, help="number of gpus, if 0 use cpu" + ) + parser.add_argument("--seed", type=int, default=1024, help="random seed") + + # parser.add_argument('--log-interval', type=int, default=10, # help='how many batches to wait before logging training status') - parser.add_argument('--resume', action='store_true', default=False, - help='resume training from checkpoint') - parser.add_argument('--resume-path', default=None, - help='checkpoint path, if none it uses the last checkpoint in exp_path') - - parser.add_argument('--exp-path', help='experiment path') - - parser.add_argument('--epochs', dest='epochs', default=1000, type=int) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument( + "--resume", + action="store_true", + default=False, + help="resume training from checkpoint", + ) + parser.add_argument( + "--resume-path", + default=None, + help="checkpoint path, if none it uses the last checkpoint in exp_path", + ) + + parser.add_argument("--exp-path", help="experiment path") + + parser.add_argument("--epochs", dest="epochs", default=1000, type=int) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() 
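The surrounding lines show the entry idiom shared by every script in this change set: parse the arguments, configure logging from the -v level, drop the verbose field, and forward the remaining namespace as keyword arguments. As a recap of that existing pattern, not new behavior:

args = parser.parse_args()
config_logger(args.verbose)   # map -v 0..3 to a logging level
del args.verbose              # the worker functions take no 'verbose' kwarg
train_xvector(**vars(args))   # remaining argparse dests map 1:1 to parameters

Note that this particular script ends by calling train_xvectors(**vars(args)), which does not match the train_xvector function defined above; the sketch uses the defined name.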
config_logger(args.verbose) del args.verbose logging.debug(args) - - train_xvectors(**vars(args)) - + train_xvectors(**vars(args)) diff --git a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py index 8411084b..69780865 100755 --- a/hyperion/bin_deprec/train-q-scoring-homo-gbe.py +++ b/hyperion/bin_deprec/train-q-scoring-homo-gbe.py @@ -21,9 +21,8 @@ from hyperion.classifiers import QScoringHomoGBE as GBE -def train_qscoring_backend(iv_file, train_list, preproc_file, - output_path, **kwargs): - +def train_qscoring_backend(iv_file, train_list, preproc_file, output_path, **kwargs): + if preproc_file is not None: preproc = TransformList.load(preproc_file) else: @@ -38,34 +37,34 @@ def train_qscoring_backend(iv_file, train_list, preproc_file, model_args = GBE.filter_train_args(**kwargs) model = GBE(**model_args) model.fit(x, class_ids) - logging.info('Elapsed time: %.2f s.' % (time.time()-t1)) + logging.info("Elapsed time: %.2f s." % (time.time() - t1)) model.save(output_path) - - + if __name__ == "__main__": - parser=argparse.ArgumentParser( + parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Trains Q-scoring back-end') + fromfile_prefix_chars="@", + description="Trains Q-scoring back-end", + ) - parser.add_argument('--iv-file', dest='iv_file', required=True) - parser.add_argument('--train-list', dest='train_list', required=True) - parser.add_argument('--preproc-file', dest='preproc_file', default=None) + parser.add_argument("--iv-file", dest="iv_file", required=True) + parser.add_argument("--train-list", dest="train_list", required=True) + parser.add_argument("--preproc-file", dest="preproc_file", default=None) VCR.add_argparse_args(parser) GBE.add_argparse_train_args(parser) - parser.add_argument('--output-path', dest='output_path', required=True) - parser.add_argument('-v', '--verbose', dest='verbose', default=1, choices=[0, 1, 2, 3], type=int) - - args=parser.parse_args() + parser.add_argument("--output-path", dest="output_path", required=True) + parser.add_argument( + "-v", "--verbose", dest="verbose", default=1, choices=[0, 1, 2, 3], type=int + ) + + args = parser.parse_args() config_logger(args.verbose) del args.verbose logging.debug(args) - - train_qscoring_backend(**vars(args)) - + train_qscoring_backend(**vars(args)) diff --git a/hyperion/bin_deprec/vectors2scores.py b/hyperion/bin_deprec/vectors2scores.py index e72a5333..cc936115 100755 --- a/hyperion/bin_deprec/vectors2scores.py +++ b/hyperion/bin_deprec/vectors2scores.py @@ -15,30 +15,31 @@ from hyperion.io import SequentialDataReaderFactory as DRF from hyperion.utils.trial_scores import TrialScores + def convert(input_file, output_file, class_file): r = DRF.create(input_file) seg_set, score_mat = r.read(0, squeeze=True) - with open(class_file, 'r') as f: + with open(class_file, "r") as f: model_set = [line.rstrip().split()[0] for line in f] scores = TrialScores(model_set, seg_set, score_mat.T) scores.save(output_file) - + if __name__ == "__main__": - parser=argparse.ArgumentParser( - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - fromfile_prefix_chars='@', - description='Converts scores from vector format to TrialScores format') + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + fromfile_prefix_chars="@", + description="Converts scores from vector format to TrialScores format", + ) - parser.add_argument('--input-file', dest='input_file', 
required=True) - parser.add_argument('--output-file', dest='output_file', required=True) - parser.add_argument('--class-file', dest='class_file', default=None) - - args=parser.parse_args() + parser.add_argument("--input-file", dest="input_file", required=True) + parser.add_argument("--output-file", dest="output_file", required=True) + parser.add_argument("--class-file", dest="class_file", default=None) - convert(**vars(args)) + args = parser.parse_args() + convert(**vars(args)) diff --git a/hyperion/calibration/__init__.py b/hyperion/calibration/__init__.py index 8f95096b..cddef567 100644 --- a/hyperion/calibration/__init__.py +++ b/hyperion/calibration/__init__.py @@ -2,7 +2,3 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ - - - - diff --git a/hyperion/calibration/gauss_calibration.py b/hyperion/calibration/gauss_calibration.py index ae2160cb..b374f7e5 100644 --- a/hyperion/calibration/gauss_calibration.py +++ b/hyperion/calibration/gauss_calibration.py @@ -7,9 +7,10 @@ from ..hyp_model import HypModel + class GaussCalibration(HypModel): - """Class for supervised Gaussian calibration - """ + """Class for supervised Gaussian calibration""" + def __init__(self, mu1=None, mu2=None, sigma2=None, prior=0.5, **kwargs): super(GaussCalibration, self).__init__(**kwargs) self.mu1 = mu1 @@ -21,55 +22,54 @@ def __init__(self, mu1=None, mu2=None, sigma2=None, prior=0.5, **kwargs): if self.is_init(): self._compute_scale_bias() - def is_init(self): return self.mu1 is not None and self.mu2 is not None and self.sigma2 is not None - def _compute_scale_bias(self): - self.a = (self.mu1 - self.mu2)/self.sigma2 - self.b = 0.5*(self.mu2**2 - self.mu1**2)/self.sigma2 - + self.a = (self.mu1 - self.mu2) / self.sigma2 + self.b = 0.5 * (self.mu2 ** 2 - self.mu1 ** 2) / self.sigma2 - def fit(self, x, y, sample_weight=None): - non = x[y==0] - tar = x[y==1] + non = x[y == 0] + tar = x[y == 1] if sample_weight is None: sw_tar = 1 sw_non = 1 sample_weight = 1 - self.prior = float(len(tar))/len(x) + self.prior = float(len(tar)) / len(x) else: - sw_non = sample_weight[y==0] - sw_tar = sample_weight[y==1] - self.prior = np.sum(sw_tar)/np.sum(sample_weight) - - self.mu1 = np.mean(sw_tar*tar)/np.mean(sw_tar) - self.mu2 = np.mean(sw_non*non)/np.mean(sw_non) + sw_non = sample_weight[y == 0] + sw_tar = sample_weight[y == 1] + self.prior = np.sum(sw_tar) / np.sum(sample_weight) + + self.mu1 = np.mean(sw_tar * tar) / np.mean(sw_tar) + self.mu2 = np.mean(sw_non * non) / np.mean(sw_non) + + self.sigma2 = ( + ( + np.sum(sw_tar * (tar - self.mu1) ** 2) + + np.sum(sw_non * (non - self.mu2) ** 2) + ) + / len(x) + / np.mean(sample_weight) + ) - self.sigma2 = (np.sum(sw_tar*(tar - self.mu1)**2) + np.sum(sw_non*(non - self.mu2)**2))/len(x)/np.mean(sample_weight) - self._compute_scale_bias() - - def predict(self, x): - return self.a*x+self.b - + return self.a * x + self.b def save_params(self, f): - params = {'mu1': self.mu1, - 'mu2': self.mu2, - 'sigma2': self.sigma2} + params = {"mu1": self.mu1, "mu2": self.mu2, "sigma2": self.sigma2} self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['mu1', 'mu2', 'sigma2'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(mu1=params['mu1'], mu2=params['mu2'], sigma2=config['sigma2'], name=config['name']) - - + param_list = ["mu1", "mu2", "sigma2"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + 
mu1=params["mu1"], + mu2=params["mu2"], + sigma2=config["sigma2"], + name=config["name"], + ) diff --git a/hyperion/calibration/unsup_gauss_calibration.py b/hyperion/calibration/unsup_gauss_calibration.py index b9ac5c32..224d0cd0 100644 --- a/hyperion/calibration/unsup_gauss_calibration.py +++ b/hyperion/calibration/unsup_gauss_calibration.py @@ -8,15 +8,16 @@ from ..pdfs.mixtures.diag_gmm_tiedcovs import DiagGMMTiedCovs as GMM from .gauss_calibration import GaussCalibration + class UnsupGaussCalibration(GaussCalibration): - """Class for unsupervised Gaussian calibration. - """ + """Class for unsupervised Gaussian calibration.""" - def __init__(self, mu1=None, mu2=None, sigma2=None, prior=0.5, init_prior=0.5, **kwargs): + def __init__( + self, mu1=None, mu2=None, sigma2=None, prior=0.5, init_prior=0.5, **kwargs + ): super(UnsupGaussCalibration, self).__init__(mu1, mu2, sigma2, prior, **kwargs) self.init_prior = init_prior - def fit(self, x): if x.ndim == 1: @@ -26,24 +27,20 @@ def fit(self, x): mu1 = self.mu1 mu2 = self.mu2 sigma2 = np.expand_dims(self.sigma2, axis=-1) - pi = np.array([self.prior, 1-self.prior]) + pi = np.array([self.prior, 1 - self.prior]) else: mu1 = np.max(x, axis=0, keepdims=True) mu2 = np.mean(x, axis=0, keepdims=True) - sigma2 = np.std(x, axis=0, keepdims=True)**2 - pi = np.array([self.init_prior, 1-self.init_prior]) + sigma2 = np.std(x, axis=0, keepdims=True) ** 2 + pi = np.array([self.init_prior, 1 - self.init_prior]) mu = np.vstack((mu1, mu2)) - gmm = GMM(mu=mu, Lambda=1/sigma2, pi=pi) + gmm = GMM(mu=mu, Lambda=1 / sigma2, pi=pi) gmm.fit(x, epochs=20) - self.mu1 = gmm.mu[0,0] - self.mu2 = gmm.mu[1,0] + self.mu1 = gmm.mu[0, 0] + self.mu2 = gmm.mu[1, 0] self.sigma2 = gmm.Sigma[0] self.prior = gmm.pi[0] - - self._compute_scale_bias() - - - + self._compute_scale_bias() diff --git a/hyperion/classifiers/__init__.py b/hyperion/classifiers/__init__.py index 158cb140..07da0af8 100644 --- a/hyperion/classifiers/__init__.py +++ b/hyperion/classifiers/__init__.py @@ -10,4 +10,3 @@ from .greedy_fusion import GreedyFusionBinaryLR from .linear_svmc import LinearSVMC from .q_scoring_homo_gbe import QScoringHomoGBE - diff --git a/hyperion/classifiers/binary_logistic_regression.py b/hyperion/classifiers/binary_logistic_regression.py index ed4b74bb..fac76485 100644 --- a/hyperion/classifiers/binary_logistic_regression.py +++ b/hyperion/classifiers/binary_logistic_regression.py @@ -6,125 +6,166 @@ from .logistic_regression import LogisticRegression -class BinaryLogisticRegression(LogisticRegression): - def __init__(self, A=None, b=None, penalty='l2', lambda_reg=1e-6, - use_bias=True, bias_scaling=1, prior=0.5, - random_state=None, solver='liblinear', max_iter=100, - dual=False, tol=0.0001, verbose=0, warm_start=True, - lr_seed=1024, **kwargs): - - priors = {0:1-prior, 1:prior} +class BinaryLogisticRegression(LogisticRegression): + def __init__( + self, + A=None, + b=None, + penalty="l2", + lambda_reg=1e-6, + use_bias=True, + bias_scaling=1, + prior=0.5, + random_state=None, + solver="liblinear", + max_iter=100, + dual=False, + tol=0.0001, + verbose=0, + warm_start=True, + lr_seed=1024, + **kwargs + ): + + priors = {0: 1 - prior, 1: prior} super(BinaryLogisticRegression, self).__init__( - A=A, b=b, penalty=penalty, lambda_reg=lambda_reg, - use_bias=use_bias, bias_scaling=bias_scaling, priors=priors, - random_state=random_state, solver=solver, max_iter=max_iter, - dual=dual, tol=tol, verbose=verbose, warm_start=warm_start, - multi_class='ovr', lr_seed=1024, **kwargs) - - + A=A, + b=b, 
+ penalty=penalty, + lambda_reg=lambda_reg, + use_bias=use_bias, + bias_scaling=bias_scaling, + priors=priors, + random_state=random_state, + solver=solver, + max_iter=max_iter, + dual=dual, + tol=tol, + verbose=verbose, + warm_start=warm_start, + multi_class="ovr", + lr_seed=1024, + **kwargs + ) @property def prior(self): return self.priors[1] - - def get_config(self): - config = {'prior': self.prior } + config = {"prior": self.prior} base_config = super(BinaryLogisticRegression, self).get_config() - del base_config['priors'] + del base_config["priors"] return dict(list(base_config.items()) + list(config.items())) - - - def predict(self, x, eval_type='logit'): + def predict(self, x, eval_type="logit"): if x.ndim == 1: x = x[:, None] - - y = np.dot(x, self.A).ravel() + self.b - if eval_type == 'log-post': - y = - np.log(1+np.exp(-(y+np.log(self.prior/(1-self.prior))))) - if eval_type == 'post': - y = 1/(1+np.exp(-(y+np.log(self.prior/(1-self.prior))))) - - return y + y = np.dot(x, self.A).ravel() + self.b + if eval_type == "log-post": + y = -np.log(1 + np.exp(-(y + np.log(self.prior / (1 - self.prior))))) + if eval_type == "post": + y = 1 / (1 + np.exp(-(y + np.log(self.prior / (1 - self.prior))))) + return y @staticmethod def filter_train_args(**kwargs): - valid_args = ('penalty', 'lambda_reg', - 'use_bias', 'bias_scaling', 'no_use_bias', - 'prior', 'lr_seed', - 'solver', 'max_iter', - 'dual', 'tol', 'verbose', - 'warm_start', 'no_warm_start', 'name') - d = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - if 'no_use_bias' in d: - d['use_bias'] = not d['no_use_bias'] - if 'no_warm_start' in d: - d['warm_start'] = not d['no_warm_start'] - - return d + valid_args = ( + "penalty", + "lambda_reg", + "use_bias", + "bias_scaling", + "no_use_bias", + "prior", + "lr_seed", + "solver", + "max_iter", + "dual", + "tol", + "verbose", + "warm_start", + "no_warm_start", + "name", + ) + d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + if "no_use_bias" in d: + d["use_bias"] = not d["no_use_bias"] + if "no_warm_start" in d: + d["warm_start"] = not d["no_warm_start"] + return d - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'penalty', - default='l2', choices=['l2', 'l1'], - help='used to specify the norm used in the penalization') - parser.add_argument(p1+'lambda-reg', - default=1e-5, type=float, - help='regularization strength') - parser.add_argument(p1+'no-use-bias', - default=False, action='store_true', - help='Not use bias') - parser.add_argument(p1+'bias-scaling', - default=1.0, type=float, - help='useful only when the solver liblinear is used and use_bias is set to True') - parser.add_argument(p1+'lr-seed', - default=1024, type=int, - help='random number generator seed') - parser.add_argument(p1+'solver', - default='lbfgs', choices=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], - help='type of solver') - parser.add_argument(p1+'max-iter', - default=100, type=int, - help='only for the newton-cg, sag and lbfgs solvers') - parser.add_argument(p1+'dual', - default=False, action='store_true', - help=('dual or primal formulation. 
' - 'Dual formulation is only implemented for l2 penalty with liblinear solver')) - parser.add_argument(p1+'tol', default=1e-4, type=float, - help='tolerance for stopping criteria') - parser.add_argument(p1+'verbose', - default=0, type=int, - help='For the liblinear and lbfgs solvers') - parser.add_argument(p1+'no-warm-start', - default=False, action='store_true', - help='don\'t use previous model to start') - - parser.add_argument(p1+'prior', - default=0.1, type=float, - help='Target prior') - - parser.add_argument(p1+'name', - default='lr', - help='model name') - - add_argparse_args = add_class_args - - - - - - + p1 = "--" + prefix + "." + + parser.add_argument( + p1 + "penalty", + default="l2", + choices=["l2", "l1"], + help="used to specify the norm used in the penalization", + ) + parser.add_argument( + p1 + "lambda-reg", default=1e-5, type=float, help="regularization strength" + ) + parser.add_argument( + p1 + "no-use-bias", default=False, action="store_true", help="Not use bias" + ) + parser.add_argument( + p1 + "bias-scaling", + default=1.0, + type=float, + help="useful only when the solver liblinear is used and use_bias is set to True", + ) + parser.add_argument( + p1 + "lr-seed", default=1024, type=int, help="random number generator seed" + ) + parser.add_argument( + p1 + "solver", + default="lbfgs", + choices=["newton-cg", "lbfgs", "liblinear", "sag", "saga"], + help="type of solver", + ) + parser.add_argument( + p1 + "max-iter", + default=100, + type=int, + help="only for the newton-cg, sag and lbfgs solvers", + ) + parser.add_argument( + p1 + "dual", + default=False, + action="store_true", + help=( + "dual or primal formulation. " + "Dual formulation is only implemented for l2 penalty with liblinear solver" + ), + ) + parser.add_argument( + p1 + "tol", default=1e-4, type=float, help="tolerance for stopping criteria" + ) + parser.add_argument( + p1 + "verbose", + default=0, + type=int, + help="For the liblinear and lbfgs solvers", + ) + parser.add_argument( + p1 + "no-warm-start", + default=False, + action="store_true", + help="don't use previous model to start", + ) + + parser.add_argument(p1 + "prior", default=0.1, type=float, help="Target prior") + + parser.add_argument(p1 + "name", default="lr", help="model name") + + add_argparse_args = add_class_args diff --git a/hyperion/classifiers/greedy_fusion.py b/hyperion/classifiers/greedy_fusion.py index 7e8103f6..338bc8d7 100644 --- a/hyperion/classifiers/greedy_fusion.py +++ b/hyperion/classifiers/greedy_fusion.py @@ -12,15 +12,29 @@ from .binary_logistic_regression import BinaryLogisticRegression as BLR -class GreedyFusionBinaryLR(HypModel): - def __init__(self, weights=None, bias=None, - system_idx=None, system_names=None, max_systems=None, - prioritize_positive=True, - penalty='l2', lambda_reg=1e-6, - bias_scaling=1, prior=0.5, prior_eval=None, - solver='liblinear', max_iter=100, - dual=False, tol=0.0001, verbose=0, lr_seed=1024, **kwargs): +class GreedyFusionBinaryLR(HypModel): + def __init__( + self, + weights=None, + bias=None, + system_idx=None, + system_names=None, + max_systems=None, + prioritize_positive=True, + penalty="l2", + lambda_reg=1e-6, + bias_scaling=1, + prior=0.5, + prior_eval=None, + solver="liblinear", + max_iter=100, + dual=False, + tol=0.0001, + verbose=0, + lr_seed=1024, + **kwargs + ): super(GreedyFusionBinaryLR, self).__init__(**kwargs) @@ -34,39 +48,43 @@ def __init__(self, weights=None, bias=None, self.prior_eval = prior else: self.prior_eval = prior_eval - - self.lr = BLR(penalty=penalty, 
lambda_reg=lambda_reg, - use_bias=True, bias_scaling=bias_scaling, - prior=prior, solver=solver, max_iter=max_iter, - dual=dual, tol=tol, verbose=verbose, warm_start=False, - lr_seed=lr_seed) + self.lr = BLR( + penalty=penalty, + lambda_reg=lambda_reg, + use_bias=True, + bias_scaling=bias_scaling, + prior=prior, + solver=solver, + max_iter=max_iter, + dual=dual, + tol=tol, + verbose=verbose, + warm_start=False, + lr_seed=lr_seed, + ) @property def prior(self): return self.lr.prior - def get_fusion_params(self, idx): return self.weights[idx], self.bias[idx], self.system_idx[idx] - - - def _predict_fus_idx(self, x, fus_idx, eval_type='logit'): + def _predict_fus_idx(self, x, fus_idx, eval_type="logit"): w, b, idx = self.get_fusion_params(fus_idx) x = x[:, idx] y = np.dot(x, w).ravel() + b - - if eval_type == 'log-post': - y = np.log(softmax(y + np.log(self.priors), axis=1)+1e-10) - if eval_type == 'post': + + if eval_type == "log-post": + y = np.log(softmax(y + np.log(self.priors), axis=1) + 1e-10) + if eval_type == "post": y = softmax(y + np.log(self.priors)) return y - - def predict(self, x, fus_idx=None, eval_type='logit'): + def predict(self, x, fus_idx=None, eval_type="logit"): if fus_idx is None: y = [] @@ -77,14 +95,12 @@ def predict(self, x, fus_idx=None, eval_type='logit'): return self._predict_fus_idx(x, fus_idx, eval_type) - - def fit(self, x, class_ids, sample_weights=None): - + num_systems = x.shape[1] if self.max_systems is None: self.max_systems = 10 - + self.max_systems = min(self.max_systems, num_systems) self.weights = [] @@ -93,13 +109,13 @@ def fit(self, x, class_ids, sample_weights=None): fus_min_dcf = np.zeros((self.max_systems,), dtype=float_cpu()) fus_act_dcf = np.zeros((self.max_systems,), dtype=float_cpu()) for i in range(self.max_systems): - cand_systems = np.arange(num_systems, dtype='int32') - fixed_systems = np.array([], dtype='int32') + cand_systems = np.arange(num_systems, dtype="int32") + fixed_systems = np.array([], dtype="int32") if i > 0: - fixed_systems = self.system_idx[i-1] + fixed_systems = self.system_idx[i - 1] cand_systems[fixed_systems] = -1 - cand_systems = cand_systems[cand_systems>-1] - + cand_systems = cand_systems[cand_systems > -1] + num_cands = len(cand_systems) cand_min_dcf = np.zeros((num_cands,), dtype=float_cpu()) cand_act_dcf = np.zeros((num_cands,), dtype=float_cpu()) @@ -107,41 +123,44 @@ def fit(self, x, class_ids, sample_weights=None): cand_weights = [] for j in range(num_cands): system_idx_ij = np.concatenate( - (fixed_systems, - np.expand_dims(cand_systems[j], axis=0)), axis=0) + (fixed_systems, np.expand_dims(cand_systems[j], axis=0)), axis=0 + ) x_ij = x[:, system_idx_ij] self.lr.fit(x_ij, class_ids) cand_weights.append([self.lr.A, self.lr.b]) all_pos[j] = np.all(self.lr.A > 0) - + y_ij = self.lr.predict(x_ij) - tar = y_ij[class_ids==1] - non = y_ij[class_ids==0] + tar = y_ij[class_ids == 1] + non = y_ij[class_ids == 0] min_dcf, act_dcf, _, _ = dcf.fast_eval_dcf_eer( - tar, non, self.prior_eval) + tar, non, self.prior_eval + ) cand_min_dcf[j] = np.mean(min_dcf) cand_act_dcf[j] = np.mean(act_dcf) - + fus_name = self._make_fus_name(system_idx_ij) - logging.info('fus_sys=%s min_dcf=%.3f act_dcf=%.3f' % ( - fus_name, cand_min_dcf[j], cand_act_dcf[j])) - + logging.info( + "fus_sys=%s min_dcf=%.3f act_dcf=%.3f" + % (fus_name, cand_min_dcf[j], cand_act_dcf[j]) + ) + dcf_best = 100 if self.prioritize_positive: allpos_cand_act_dcf = np.copy(cand_act_dcf) - allpos_cand_act_dcf[all_pos==False] = 100 + allpos_cand_act_dcf[all_pos == False] 
= 100 j_best = np.argmin(allpos_cand_act_dcf) dcf_best = allpos_cand_act_dcf[j_best] - + if dcf_best == 100: j_best = np.argmin(cand_act_dcf) dcf_best = cand_act_dcf[j_best] - + select_system = np.asarray([cand_systems[j_best]]) - if i==0: + if i == 0: fus_system_i = select_system else: - fus_system_i = np.concatenate((self.system_idx[i-1], select_system)) + fus_system_i = np.concatenate((self.system_idx[i - 1], select_system)) self.system_idx.append(fus_system_i) @@ -154,66 +173,82 @@ def fit(self, x, class_ids, sample_weights=None): # print report for i in range(self.max_systems): fus_name = self._make_fus_name(self.system_idx[i]) - weights_str = np.array2string(self.weights[i].ravel(), separator=',').replace('\r', '').replace('\n', '') - bias_str = np.array2string(self.bias[i], separator=',') - logging.info('Best-%d=%s min_dcf=%.3f act_dcf=%.3f weights=%s bias=%s' % ( - i+1,fus_name,fus_min_dcf[i],fus_act_dcf[i], weights_str, bias_str)) - + weights_str = ( + np.array2string(self.weights[i].ravel(), separator=",") + .replace("\r", "") + .replace("\n", "") + ) + bias_str = np.array2string(self.bias[i], separator=",") + logging.info( + "Best-%d=%s min_dcf=%.3f act_dcf=%.3f weights=%s bias=%s" + % ( + i + 1, + fus_name, + fus_min_dcf[i], + fus_act_dcf[i], + weights_str, + bias_str, + ) + ) + return fus_min_dcf, fus_act_dcf - def _make_fus_name(self, idx): sys_names = [self.system_names[i] for i in idx] - fus_name = '+'.join(sys_names) + fus_name = "+".join(sys_names) return fus_name - - def get_config(self): - config = { 'bias_scaling': self.lr.bias_scaling, - 'prior': self.lr.prior } + config = {"bias_scaling": self.lr.bias_scaling, "prior": self.lr.prior} base_config = super(GreedyFusionBinaryLR, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): weights = np.concatenate(tuple(self.weights), axis=0) bias = np.concatenate(tuple(self.bias)) system_idx = np.concatenate(tuple(self.system_idx), axis=0) - system_names = np.asarray(self.system_names, dtype='S') - #print(system_names) - #print(system_names.astype('S')) - params = { 'weights': weights, - 'bias': bias, - 'system_idx': system_idx, - 'system_names': system_names} - dtypes = { 'weights': float_save(), 'bias': float_save(), 'system_idx': 'int32', 'system_names': 'S'} - + system_names = np.asarray(self.system_names, dtype="S") + # print(system_names) + # print(system_names.astype('S')) + params = { + "weights": weights, + "bias": bias, + "system_idx": system_idx, + "system_names": system_names, + } + dtypes = { + "weights": float_save(), + "bias": float_save(), + "system_idx": "int32", + "system_names": "S", + } + self._save_params_from_dict(f, params, dtypes=dtypes) - @classmethod def load_params(cls, f, config): - param_list = ['weights', 'bias', 'system_idx', 'system_names'] - dtypes = { 'weights': float_cpu(), 'bias': float_cpu(), 'system_idx': 'int32', 'system_names': 'S'} - params = cls._load_params_to_dict(f, config['name'], param_list, dtypes) - #print(params) + param_list = ["weights", "bias", "system_idx", "system_names"] + dtypes = { + "weights": float_cpu(), + "bias": float_cpu(), + "system_idx": "int32", + "system_names": "S", + } + params = cls._load_params_to_dict(f, config["name"], param_list, dtypes) + # print(params) weights = [] system_idx = [] i = 1 j = 0 - while j < params['weights'].shape[0]: - weights.append(params['weights'][j:j+i,:]) - system_idx.append(params['system_idx'][j:j+i]) - j+=i - i+=1 - - - params['weights'] = weights - params['system_idx'] = 
system_idx - params['system_names'] = [ - t.decode('utf-8') for t in params['system_names']] - #print(params) + while j < params["weights"].shape[0]: + weights.append(params["weights"][j : j + i, :]) + system_idx.append(params["system_idx"][j : j + i]) + j += i + i += 1 + + params["weights"] = weights + params["system_idx"] = system_idx + params["system_names"] = [t.decode("utf-8") for t in params["system_names"]] + # print(params) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) diff --git a/hyperion/classifiers/linear_gbe.py b/hyperion/classifiers/linear_gbe.py index c17cb337..4337c5f0 100644 --- a/hyperion/classifiers/linear_gbe.py +++ b/hyperion/classifiers/linear_gbe.py @@ -12,17 +12,26 @@ from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax - class LinearGBE(HypModel): + def __init__( + self, + mu=None, + W=None, + update_mu=True, + update_W=True, + x_dim=1, + num_classes=None, + balance_class_weight=True, + beta=None, + nu=None, + prior=None, + prior_beta=None, + prior_nu=None, + post_beta=None, + post_nu=None, + **kwargs + ): - def __init__(self, mu=None, W=None, - update_mu=True, update_W=True, - x_dim=1, num_classes=None, balance_class_weight=True, - beta=None, nu=None, - prior=None, prior_beta=None, prior_nu=None, - post_beta=None, post_nu=None, - **kwargs): - super(LinearGBE, self).__init__(**kwargs) if mu is not None: num_classes = mu.shape[0] @@ -42,115 +51,107 @@ def __init__(self, mu=None, W=None, self.nu = nu self.prior_beta = prior_beta self.prior_nu = prior_nu - self.post_beta= post_beta + self.post_beta = post_beta self.post_nu = post_nu self._compute_Ab() - - def get_config(self): - config = { 'update_mu': self.update_mu, - 'update_W': self.update_W, - 'x_dim': self.x_dim, - 'num_classes': self.num_classes, - 'balance_class_weight': self.balance_class_weight, - 'prior_beta': self.prior_beta, - 'prior_nu': self.prior_nu, - 'post_beta': self.post_beta, - 'post_nu': self.post_nu } - + config = { + "update_mu": self.update_mu, + "update_W": self.update_W, + "x_dim": self.x_dim, + "num_classes": self.num_classes, + "balance_class_weight": self.balance_class_weight, + "prior_beta": self.prior_beta, + "prior_nu": self.prior_nu, + "post_beta": self.post_beta, + "post_nu": self.post_nu, + } + base_config = super(LinearGBE, self).get_config() return dict(list(base_config.items()) + list(config.items())) - def _load_prior(self): if isinstance(self.prior, str): self.prior = LinearGBE.load(self.prior) num_classes = self.prior.mu.shape[0] if self.prior_beta is not None: - self.prior.beta = self.prior_beta*np.ones((num_classes,), dtype=float_cpu()) + self.prior.beta = self.prior_beta * np.ones( + (num_classes,), dtype=float_cpu() + ) if self.prior_nu is not None: - self.prior.nu = num_classes*self.prior_nu - + self.prior.nu = num_classes * self.prior_nu - def _change_post_r(self): - + if self.post_beta is not None: - self.beta = self.post_beta*np.ones((self.num_classes,), dtype=float_cpu()) + self.beta = self.post_beta * np.ones((self.num_classes,), dtype=float_cpu()) if self.post_nu is not None: - self.nu = self.num_classes*self.post_nu - - + self.nu = self.num_classes * self.post_nu def eval_linear(self, x): return np.dot(x, self.A) + self.b - - def eval_llk(self, x): logp = np.dot(x, self.A) + self.b - K = 0.5*logdet_pdmat(self.W) - 0.5*self.x_dim*np.log(2*np.pi) - K += -0.5*np.sum(np.dot(x, self.W)*x, axis=1, keepdims=True) + K = 0.5 * logdet_pdmat(self.W) - 0.5 * self.x_dim * np.log(2 * np.pi) + K += -0.5 * np.sum(np.dot(x, 
self.W) * x, axis=1, keepdims=True) logp += K return logp - - def eval_predictive(self, x): - K = self.W/self.nu - c = (self.nu+1-self.x_dim) - r = self.beta/(self.beta+1) - + K = self.W / self.nu + c = self.nu + 1 - self.x_dim + r = self.beta / (self.beta + 1) + # T(mu, L, c) ; L = c r K - - logg = gammaln((c+self.x_dim)/2) - gammaln(c/2) - 0.5*self.x_dim*np.log(c*np.pi) - # 0.5*log|L| = 0.5*log|K| + 0.5*d*log(c r) + logg = ( + gammaln((c + self.x_dim) / 2) + - gammaln(c / 2) + - 0.5 * self.x_dim * np.log(c * np.pi) + ) + + # 0.5*log|L| = 0.5*log|K| + 0.5*d*log(c r) logK = logdet_pdmat(K) - logL_div_2 = 0.5*logK + 0.5*self.x_dim*r - + logL_div_2 = 0.5 * logK + 0.5 * self.x_dim * r + # delta2_0 = (x-mu)^T W (x-mu) - delta2_0 = np.sum(np.dot(x, self.W)*x, axis=1, keepdims=True) - 2*( - np.dot(x, self.A) + self.b) + delta2_0 = np.sum(np.dot(x, self.W) * x, axis=1, keepdims=True) - 2 * ( + np.dot(x, self.A) + self.b + ) # delta2 = (x-mu)^T L (x-mu) = c r delta0 / nu # delta2/c = r delta0 / nu - delta2_div_c = r*delta2_0/self.nu + delta2_div_c = r * delta2_0 / self.nu - D = -0.5*(c+self.x_dim)*np.log(1+delta2_div_c) + D = -0.5 * (c + self.x_dim) * np.log(1 + delta2_div_c) logging.debug(self.nu) logging.debug(c) logging.debug(self.x_dim) logging.debug(logg) logging.debug(logL_div_2.shape) logging.debug(D.shape) - + logp = logg + logL_div_2 + D return logp - - - def predict(self, x, eval_method='linear', normalize=False): - if eval_method == 'linear': + def predict(self, x, eval_method="linear", normalize=False): + if eval_method == "linear": logp = self.eval_linear(x) - elif eval_method == 'llk': + elif eval_method == "llk": logp = self.eval_llk(x) - elif eval_method == 'predictive': + elif eval_method == "predictive": logp = self.eval_predictive(x) else: - raise ValueError('wrong eval method %s' % eval_method) - + raise ValueError("wrong eval method %s" % eval_method) + if normalize: logp = np.log(softmax(logp, axis=1)) - - return logp - + return logp - - def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): assert class_ids is not None or p_theta is not None @@ -158,201 +159,202 @@ def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): do_map = True if self.prior is not None else False if do_map: self._load_prior() - + self.x_dim = x.shape[-1] if self.num_classes is None: if class_ids is not None: - self.num_classes = np.max(class_ids)+1 + self.num_classes = np.max(class_ids) + 1 else: self.num_classes = p_theta.shape[-1] - + if class_ids is not None: p_theta = int2onehot(class_ids, self.num_classes) if sample_weight is not None: - p_theta = sample_weight[:, None]*p_theta - + p_theta = sample_weight[:, None] * p_theta + N = np.sum(p_theta, axis=0) F = np.dot(p_theta.T, x) if self.update_mu: - xbar = F/N[:,None] + xbar = F / N[:, None] if do_map: - alpha_mu = (N/(N+self.prior.beta))[:, None] - self.mu = (1-alpha_mu)*self.prior.mu + alpha_mu*xbar - self.beta = N+self.prior.beta + alpha_mu = (N / (N + self.prior.beta))[:, None] + self.mu = (1 - alpha_mu) * self.prior.mu + alpha_mu * xbar + self.beta = N + self.prior.beta else: self.mu = xbar self.beta = N else: xbar = self.mu - if self.update_W: if do_map: nu0 = self.prior.nu S0 = invert_pdmat(self.prior.W, return_inv=True)[-1] if self.balance_class_weight: - alpha_W = (N/(N+nu0/self.num_classes))[:, None] - S = (self.num_classes - np.sum(alpha_W))*S0 + alpha_W = (N / (N + nu0 / self.num_classes))[:, None] + S = (self.num_classes - np.sum(alpha_W)) * S0 else: - S = nu0*S0 + S = nu0 * S0 else: nu0 = 0 S = 
np.zeros((x.shape[1], x.shape[1]), dtype=float_cpu()) - + for k in range(self.num_classes): delta = x - xbar[k] - S_k = np.dot(p_theta[:, k]*delta.T, delta) + S_k = np.dot(p_theta[:, k] * delta.T, delta) if do_map and self.update_mu: mu_delta = xbar[k] - self.prior.mu[k] - S_k += N[k]*(1-alpha_mu[k])*np.outer(mu_delta, mu_delta) + S_k += N[k] * (1 - alpha_mu[k]) * np.outer(mu_delta, mu_delta) if self.balance_class_weight: - S_k /= (N[k]+nu0/self.num_classes) + S_k /= N[k] + nu0 / self.num_classes S += S_k - + if self.balance_class_weight: S /= self.num_classes else: - S /= (nu0+np.sum(N)) + S /= nu0 + np.sum(N) self.W = invert_pdmat(S, return_inv=True)[-1] - self.nu = np.sum(N)+nu0 - + self.nu = np.sum(N) + nu0 + self._change_post_r() self._compute_Ab() - - def save_params(self, f): - params = { 'mu': self.mu, - 'W': self.W, - 'beta': self.beta, - 'nu': self.nu } + params = {"mu": self.mu, "W": self.W, "beta": self.beta, "nu": self.nu} self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'W', 'beta', 'nu'] - params = cls._load_params_to_dict(f, config['name'], param_list) + param_list = ["mu", "W", "beta", "nu"] + params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) - - def _compute_Ab(self): if self.mu is not None and self.W is not None: self.A = np.dot(self.W, self.mu.T) - self.b = -0.5 * np.sum(self.mu.T*self.A, axis=0) - + self.b = -0.5 * np.sum(self.mu.T * self.A, axis=0) - @staticmethod def filter_args(**kwargs): if prefix is None: - p = '' + p = "" else: - p = prefix + '_' - - valid_args = ('update_mu', 'update_W', - 'no_update_mu', 'no_update_W', - 'balance_class_weight', - 'prior', 'prior_beta', 'prior_nu', - 'post_beta', 'post_nu', - 'name') - d = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - if 'no_update_mu' in d: - d['update_mu'] = not d['no_update_mu'] - if 'no_update_W' in d: - d['update_W'] = not d['no_update_W'] - + p = prefix + "_" + + valid_args = ( + "update_mu", + "update_W", + "no_update_mu", + "no_update_W", + "balance_class_weight", + "prior", + "prior_beta", + "prior_nu", + "post_beta", + "post_nu", + "name", + ) + d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + if "no_update_mu" in d: + d["update_mu"] = not d["no_update_mu"] + if "no_update_W" in d: + d["update_W"] = not d["no_update_W"] + return d filter_train_args = filter_args - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." 
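Note on the LinearGBE scoring above: _compute_Ab builds A = W·muᵀ and b_k = -0.5·mu_kᵀ·W·mu_k so that eval_linear(x) = x·A + b reproduces the class-dependent part of the shared-precision Gaussian log-likelihood. A minimal sketch with toy, hypothetical values (not from the repo) checking that the two scorings differ only by a per-sample constant:

import numpy as np

# Toy setup: 2 classes, 3-dim features, shared positive-definite precision W.
rng = np.random.default_rng(0)
mu = rng.normal(size=(2, 3))                 # class means (num_classes x dim)
L = rng.normal(size=(3, 3))
W = L @ L.T + 3 * np.eye(3)                  # shared precision matrix

# Same construction as LinearGBE._compute_Ab
A = W @ mu.T                                 # (dim x num_classes)
b = -0.5 * np.sum(mu.T * A, axis=0)          # (num_classes,)

x = rng.normal(size=(5, 3))
linear = x @ A + b                           # eval_linear-style scores

def loggauss(x, m, W):
    # Full Gaussian log-density with precision parameterization.
    d = x.shape[-1]
    _, logdet = np.linalg.slogdet(W)
    diff = x - m
    return 0.5 * logdet - 0.5 * d * np.log(2 * np.pi) - 0.5 * np.sum(diff @ W * diff, axis=-1)

llk = np.stack([loggauss(x, mu[k], W) for k in range(2)], axis=1)

# Differences between classes are identical, so decisions and LLRs agree.
print(np.allclose(llk - llk[:, :1], linear - linear[:, :1]))   # True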
parser.add_argument( - p1+'no-update-mu', - default=False, action='store_true', - help='do not update mu') + p1 + "no-update-mu", + default=False, + action="store_true", + help="do not update mu", + ) parser.add_argument( - p1+'no-update-W', - default=False, action='store_true', - help='do not update W') + p1 + "no-update-W", + default=False, + action="store_true", + help="do not update W", + ) parser.add_argument( - p1+'balance-class-weight', - default=False, action='store_true', - help='Balances the weight of each class when computing W') + p1 + "balance-class-weight", + default=False, + action="store_true", + help="Balances the weight of each class when computing W", + ) parser.add_argument( - p1+'prior', - default=None, - help='prior file for MAP adaptation') + p1 + "prior", default=None, help="prior file for MAP adaptation" + ) parser.add_argument( - p1+'prior-beta', - default=16, type=float, - help='relevance factor for the means') + p1 + "prior-beta", + default=16, + type=float, + help="relevance factor for the means", + ) parser.add_argument( - p1+'prior-nu', - default=16, type=float, - help='relevance factor for the variances') + p1 + "prior-nu", + default=16, + type=float, + help="relevance factor for the variances", + ) parser.add_argument( - p1+'post-beta', - default=None, type=float, - help='relevance factor for the means') - parser.add_argument( - p1+'post-nu', - default=None, type=float, - help='relevance factor for the variances') - + p1 + "post-beta", + default=None, + type=float, + help="relevance factor for the means", + ) parser.add_argument( - p1+'name', - default='lgbe', - help='model name') - + p1 + "post-nu", + default=None, + type=float, + help="relevance factor for the variances", + ) + parser.add_argument(p1 + "name", default="lgbe", help="model name") @staticmethod def filter_eval_args(prefix, **kwargs): - valid_args = ('model_file', 'normalize', 'eval_method') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - + valid_args = ("model_file", "normalize", "eval_method") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - @staticmethod def add_eval_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." 
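The --prior-beta option above is the relevance factor used by fit() for MAP adaptation of the means: alpha = N / (N + beta) and mu = (1 - alpha)·mu_prior + alpha·xbar. A small numeric sketch with hypothetical counts, only to show how beta controls the shrinkage toward the prior:

import numpy as np

mu_prior = np.array([[0.0, 0.0], [1.0, 1.0]])   # prior class means
xbar = np.array([[0.4, -0.2], [1.5, 0.8]])      # ML means from adaptation data
N = np.array([5.0, 200.0])                      # effective counts per class
beta = 16.0                                     # --prior-beta, relevance factor for the means

alpha = (N / (N + beta))[:, None]               # per-class shrinkage weight
mu_map = (1 - alpha) * mu_prior + alpha * xbar
print(alpha.ravel())   # ~[0.24, 0.93]: few samples -> stay near prior, many -> follow data
print(mu_map)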
+ parser.add_argument(p1 + "model-file", required=True, help=("model file")) parser.add_argument( - p1+'model-file', required=True, - help=('model file')) + p1 + "normalize", + default=False, + action="store_true", + help=("normalizes the ouput probabilities to sum to one"), + ) parser.add_argument( - p1+'normalize', default=False, - action='store_true', - help=('normalizes the ouput probabilities to sum to one')) - parser.add_argument( - p1+'eval-method', default='linear', - choices=['linear','llk','predictive'], - help=('evaluates full gaussian likelihood, linear function' - 'or predictive distribution')) - - + p1 + "eval-method", + default="linear", + choices=["linear", "llk", "predictive"], + help=( + "evaluates full gaussian likelihood, linear function" + "or predictive distribution" + ), + ) + add_argparse_args = add_class_args add_argparse_train_args = add_class_args add_argparse_eval_args = add_eval_args diff --git a/hyperion/classifiers/linear_gbe1.py b/hyperion/classifiers/linear_gbe1.py index 7f9e7417..71edd606 100644 --- a/hyperion/classifiers/linear_gbe1.py +++ b/hyperion/classifiers/linear_gbe1.py @@ -10,14 +10,21 @@ from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax - class LinearGBE(HypModel): - - def __init__(self, mu=None, W=None, - update_mu=True, update_W=True, - x_dim=1, num_classes=None, balance_class_weight=True, - do_map=False, r_mu=16, r_W=16, - **kwargs): + def __init__( + self, + mu=None, + W=None, + update_mu=True, + update_W=True, + x_dim=1, + num_classes=None, + balance_class_weight=True, + do_map=False, + r_mu=16, + r_W=16, + **kwargs + ): super(LinearGBE, self).__init__(**kwargs) if mu is not None: num_classes = mu.shape[0] @@ -37,38 +44,34 @@ def __init__(self, mu=None, W=None, self.r_W = r_W self._compute_Ab() - - def get_config(self): - config = { 'update_mu': self.update_mu, - 'update_W': self.update_W, - 'x_dim': self.x_dim, - 'num_classes': self.num_classes, - 'balance_class_weight': self.balance_class_weight, - 'do_map': self.do_map, - 'r_mu': self.r_mu, - 'r_W': self.r_W} + config = { + "update_mu": self.update_mu, + "update_W": self.update_W, + "x_dim": self.x_dim, + "num_classes": self.num_classes, + "balance_class_weight": self.balance_class_weight, + "do_map": self.do_map, + "r_mu": self.r_mu, + "r_W": self.r_W, + } base_config = super(LinearGBE, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - def predict(self, x, normalize=False, return_full_llk=False): logp = np.dot(x, self.A) + self.b - + if return_full_llk: - K = 0.5*logdet_pdmat(self.W) - 0.5*self.x_dim*np.log(2*np.pi) - K += -0.5*np.sum(np.dot(x, self.W)*x, axis=1, keepdims=True) + K = 0.5 * logdet_pdmat(self.W) - 0.5 * self.x_dim * np.log(2 * np.pi) + K += -0.5 * np.sum(np.dot(x, self.W) * x, axis=1, keepdims=True) logp += K - + if normalize: logp = np.log(softmax(logp, axis=1)) - - return logp + return logp - def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): assert class_ids is not None or p_theta is not None @@ -76,16 +79,16 @@ def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): self.x_dim = x.shape[-1] if self.num_classes is None: if class_ids is not None: - self.num_classes = np.max(class_ids)+1 + self.num_classes = np.max(class_ids) + 1 else: self.num_classes = p_theta.shape[-1] - + if class_ids is not None: p_theta = int2onehot(class_ids, self.num_classes) if sample_weight is not None: - p_theta = sample_weight[:, None]*p_theta - + p_theta = sample_weight[:, None] * p_theta + N = 
np.sum(p_theta, axis=0) F = np.dot(p_theta.T, x) @@ -93,142 +96,169 @@ def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): mu0 = self.mu xbar = mu0 if self.update_mu: - xbar = F/N[:,None] + xbar = F / N[:, None] if self.do_map: - alpha = (N/(N+self.r_mu))[:, None] - self.mu = (1-alpha)*mu0 + alpha*xbar + alpha = (N / (N + self.r_mu))[:, None] + self.mu = (1 - alpha) * mu0 + alpha * xbar else: self.mu = xbar - + if self.update_W: if self.do_map: r_W = self.r_W - alpha = (N/(N+r_W))[:, None] + alpha = (N / (N + r_W))[:, None] S0 = invert_pdmat(self.W, return_inv=True)[-1] if self.balance_class_weight: - S = (self.num_classes - np.sum(alpha))*S0 + S = (self.num_classes - np.sum(alpha)) * S0 else: - S = self.num_classes*self.r_W*S0 + S = self.num_classes * self.r_W * S0 else: r_W = 0 S = np.zeros((x.shape[1], x.shape[1]), dtype=float_cpu()) - + for k in range(self.num_classes): delta = x - xbar[k] - S_k = np.dot(p_theta[:, k]*delta.T, delta) + S_k = np.dot(p_theta[:, k] * delta.T, delta) if self.do_map: mu_delta = xbar[k] - mu0[k] - S_k += self.r_W*alpha[k]*np.outer(mu_delta, mu_delta) + S_k += self.r_W * alpha[k] * np.outer(mu_delta, mu_delta) if self.balance_class_weight: - S_k /= (N[k]+r_W) + S_k /= N[k] + r_W S += S_k - + if self.balance_class_weight: S /= self.num_classes else: - S /= (self.num_classes*r_w+np.sum(N)) - + S /= self.num_classes * r_w + np.sum(N) + self.W = invert_pdmat(S, return_inv=True)[-1] self._compute_Ab() - def save_params(self, f): - params = { 'mu': self.mu, - 'W': self.W} + params = {"mu": self.mu, "W": self.W} self._save_params_from_dict(f, params) - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'W'] - params = cls._load_params_to_dict(f, config['name'], param_list) + param_list = ["mu", "W"] + params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) - def _compute_Ab(self): if self.mu is not None and self.W is not None: self.A = np.dot(self.W, self.mu.T) - self.b = -0.5 * np.sum(self.mu.T*self.A, axis=0) - + self.b = -0.5 * np.sum(self.mu.T * self.A, axis=0) @staticmethod def filter_args(**kwargs): - - valid_args = ('update_mu', 'update_W', - 'balance_class_weight', - 'do_map', 'r_mu', 'r_W', - 'name') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + + valid_args = ( + "update_mu", + "update_W", + "balance_class_weight", + "do_map", + "r_mu", + "r_W", + "name", + ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) filter_train_args = filter_args - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' - p2 = '' + p1 = "--" + p2 = "" else: - p1 = '--' + prefix + '.' - p2 = prefix + '.' 
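The balance_class_weight option in fit() above changes how the per-class scatters S_k are pooled: with balancing, each S_k is divided by its own count and the classes are averaged equally; without it, the summed scatter is divided by the total count. A toy ML-only sketch (no MAP terms, hypothetical data) showing the difference:

import numpy as np

rng = np.random.default_rng(1)
x0 = rng.normal(0.0, 1.0, size=(1000, 2))   # large class, unit variance
x1 = rng.normal(3.0, 2.0, size=(10, 2))     # small class, larger variance
N = np.array([1000.0, 10.0])

S_k = []
for x in (x0, x1):
    d = x - x.mean(axis=0)
    S_k.append(d.T @ d)                     # within-class scatter

# balance_class_weight=False: pool scatters, divide by the total count.
S_unbal = (S_k[0] + S_k[1]) / N.sum()
# balance_class_weight=True: normalize each class by its own count, then average classes.
S_bal = (S_k[0] / N[0] + S_k[1] / N[1]) / 2

print(np.diag(S_unbal))  # dominated by the large class, close to [1, 1]
print(np.diag(S_bal))    # both classes count equally, pulled toward the small-class variance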
- - parser.add_argument(p1+'no-update-mu', - default=True, action='store_false', - help='not update mu') - parser.add_argument(p1+'no-update-W', dest=(p2+'update_W'), - default=True, action='store_false', - help='not update W') - parser.add_argument(p1+'balance-class-weight', dest=(p2+'balance_class_weight'), - default=False, action='store_true', - help='Balances the weight of each class when computing W') - parser.add_argument(p1+'do-map', dest=(p2+'do_map'), - default=False, action='store_true', - help='does MAP adaptation') - parser.add_argument(p1+'r-mu', dest=(p2+'r_mu'), - default=16, type=float, - help='relevance factor for the means') - parser.add_argument(p1+'r-w', dest=(p2+'r_W'), - default=16, type=float, - help='relevance factor for the variances') - - parser.add_argument(p1+'name', dest=(p2+'name'), - default='lgbe', - help='model name') - - + p1 = "--" + prefix + "." + p2 = prefix + "." + + parser.add_argument( + p1 + "no-update-mu", + default=True, + action="store_false", + help="not update mu", + ) + parser.add_argument( + p1 + "no-update-W", + dest=(p2 + "update_W"), + default=True, + action="store_false", + help="not update W", + ) + parser.add_argument( + p1 + "balance-class-weight", + dest=(p2 + "balance_class_weight"), + default=False, + action="store_true", + help="Balances the weight of each class when computing W", + ) + parser.add_argument( + p1 + "do-map", + dest=(p2 + "do_map"), + default=False, + action="store_true", + help="does MAP adaptation", + ) + parser.add_argument( + p1 + "r-mu", + dest=(p2 + "r_mu"), + default=16, + type=float, + help="relevance factor for the means", + ) + parser.add_argument( + p1 + "r-w", + dest=(p2 + "r_W"), + default=16, + type=float, + help="relevance factor for the variances", + ) + + parser.add_argument( + p1 + "name", dest=(p2 + "name"), default="lgbe", help="model name" + ) @staticmethod def filter_eval_args(**kwargs): - valid_args = ('model_file', 'normalize', 'return_full_llk') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("model_file", "normalize", "return_full_llk") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod def add_argparse_eval_args(parser, prefix=None): if prefix is None: - p1 = '--' - p2 = '' + p1 = "--" + p2 = "" else: - p1 = '--' + prefix + '.' - p2 = prefix + '.' - - parser.add_argument(p1+'model-file', dest=(p2+'model_file'), required=True, - help=('model file')) - parser.add_argument(p1+'normalize', dest=(p2+'normalize'), default=False, - action='store_true', - help=('normalizes the ouput probabilities to sum to one')) - parser.add_argument(p1+'return-full-llk', dest=(p2+'return_full_llk'), default=False, - action='store_true', - help=('evaluates full gaussian likelihood instead of linear function')) - - + p1 = "--" + prefix + "." + p2 = prefix + "." 
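The add_class_args code above registers negative flags such as --no-update-W with dest=(prefix + "update_W") and action="store_false", so a "no-X" command-line switch ends up as a positive boolean attribute. A minimal standalone argparse sketch of that pattern (prefix handling omitted, since it is plain string concatenation):

import argparse

parser = argparse.ArgumentParser()
# Negative flag mapped onto a positive attribute, as in add_class_args.
parser.add_argument(
    "--no-update-W", dest="update_W", default=True, action="store_false",
    help="not update W",
)

print(parser.parse_args([]).update_W)                 # True  (default: update W)
print(parser.parse_args(["--no-update-W"]).update_W)  # False (flag disables the update)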
+ + parser.add_argument( + p1 + "model-file", + dest=(p2 + "model_file"), + required=True, + help=("model file"), + ) + parser.add_argument( + p1 + "normalize", + dest=(p2 + "normalize"), + default=False, + action="store_true", + help=("normalizes the ouput probabilities to sum to one"), + ) + parser.add_argument( + p1 + "return-full-llk", + dest=(p2 + "return_full_llk"), + default=False, + action="store_true", + help=("evaluates full gaussian likelihood instead of linear function"), + ) + add_argparse_args = add_class_args add_argparse_train_args = add_class_args add_argparse_eval_args = add_eval_args diff --git a/hyperion/classifiers/linear_gbe_up.py b/hyperion/classifiers/linear_gbe_up.py index 4ab7a3d4..8c855dfa 100644 --- a/hyperion/classifiers/linear_gbe_up.py +++ b/hyperion/classifiers/linear_gbe_up.py @@ -9,180 +9,196 @@ from ..hyp_defs import float_cpu from ..hyp_model import HypModel -from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax, fullcov_varfloor +from ..utils.math import ( + int2onehot, + logdet_pdmat, + invert_pdmat, + softmax, + fullcov_varfloor, +) from .linear_gbe import LinearGBE class LinearGBEUP(LinearGBE): + def __init__( + self, + mu=None, + W=None, + update_mu=True, + update_W=True, + x_dim=1, + num_classes=None, + balance_class_weight=True, + beta=None, + nu=None, + prior=None, + prior_beta=None, + prior_nu=None, + post_beta=None, + post_nu=None, + **kwargs + ): - def __init__(self, mu=None, W=None, - update_mu=True, update_W=True, - x_dim=1, num_classes=None, balance_class_weight=True, - beta=None, nu=None, - prior=None, prior_beta=None, prior_nu=None, - post_beta=None, post_nu=None, - **kwargs): - super(LinearGBEUP, self).__init__( - mu=mu, W=W, - update_mu=update_mu, update_W=update_W, - x_dim=x_dim, num_classes=num_classes, + mu=mu, + W=W, + update_mu=update_mu, + update_W=update_W, + x_dim=x_dim, + num_classes=num_classes, balance_class_weight=balance_class_weight, - beta=beta, nu=nu, - prior=prior, prior_beta=prior_beta, prior_nu=prior_nu, - post_beta=post_beta, post_nu=post_nu, - **kwargs) - - - + beta=beta, + nu=nu, + prior=prior, + prior_beta=prior_beta, + prior_nu=prior_nu, + post_beta=post_beta, + post_nu=post_nu, + **kwargs + ) def eval_linear(self, x): - x_m = x[:,:x.shape[-1]/2] - x_s = x[:,x.shape[-1]/2:] + x_m = x[:, : x.shape[-1] / 2] + x_s = x[:, x.shape[-1] / 2 :] try: S = invert_pdmat(self.W, return_inv=True)[-1] except: -# self.W += np.mean(np.diag(self.W))/1000*np.eye(x.shape[-1]/2) + # self.W += np.mean(np.diag(self.W))/1000*np.eye(x.shape[-1]/2) S = invert_pdmat(self.W, return_inv=True)[-1] - + logp = np.zeros((len(x), self.num_classes), dtype=float_cpu()) for i in range(x.shape[0]): - W_i = invert_pdmat(S+np.diag(x_s[i]), return_inv=True)[-1] + W_i = invert_pdmat(S + np.diag(x_s[i]), return_inv=True)[-1] A, b = self._compute_Ab_i(self.mu, W_i) logp[i] = np.dot(x_m[i], A) + b return logp - - def eval_llk(self, x): raise NotImplementedError logp = np.dot(x, self.A) + self.b - K = 0.5*logdet_pdmat(self.W) - 0.5*self.x_dim*np.log(2*np.pi) - K += -0.5*np.sum(np.dot(x, self.W)*x, axis=1, keepdims=True) + K = 0.5 * logdet_pdmat(self.W) - 0.5 * self.x_dim * np.log(2 * np.pi) + K += -0.5 * np.sum(np.dot(x, self.W) * x, axis=1, keepdims=True) logp += K return logp - - def eval_predictive(self, x): raise NotImplementedError - K = self.W/self.nu - c = (self.nu+1-self.x_dim) - r = self.beta/(self.beta+1) - + K = self.W / self.nu + c = self.nu + 1 - self.x_dim + r = self.beta / (self.beta + 1) + # T(mu, L, c) ; L = c r K - - logg 
= gammaln((c+self.x_dim)/2) - gammaln(c/2) - 0.5*self.x_dim*np.log(c*np.pi) - # 0.5*log|L| = 0.5*log|K| + 0.5*d*log(c r) + logg = ( + gammaln((c + self.x_dim) / 2) + - gammaln(c / 2) + - 0.5 * self.x_dim * np.log(c * np.pi) + ) + + # 0.5*log|L| = 0.5*log|K| + 0.5*d*log(c r) logK = logdet_pdmat(K) - logL_div_2 = 0.5*logK + 0.5*self.x_dim*r - + logL_div_2 = 0.5 * logK + 0.5 * self.x_dim * r + # delta2_0 = (x-mu)^T W (x-mu) - delta2_0 = np.sum(np.dot(x, self.W)*x, axis=1, keepdims=True) - 2*( - np.dot(x, self.A) + self.b) + delta2_0 = np.sum(np.dot(x, self.W) * x, axis=1, keepdims=True) - 2 * ( + np.dot(x, self.A) + self.b + ) # delta2 = (x-mu)^T L (x-mu) = c r delta0 / nu # delta2/c = r delta0 / nu - delta2_div_c = r*delta2_0/self.nu + delta2_div_c = r * delta2_0 / self.nu - D = -0.5*(c+self.x_dim)*np.log(1+delta2_div_c) + D = -0.5 * (c + self.x_dim) * np.log(1 + delta2_div_c) logging.debug(self.nu) logging.debug(c) logging.debug(self.x_dim) logging.debug(logg) logging.debug(logL_div_2.shape) logging.debug(D.shape) - + logp = logg + logL_div_2 + D return logp - def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): - x_m = x[:,:x.shape[-1]/2] - x_s = x[:,x.shape[-1]/2:] + x_m = x[:, : x.shape[-1] / 2] + x_s = x[:, x.shape[-1] / 2 :] x = x_m assert class_ids is not None or p_theta is not None do_map = True if self.prior is not None else False if do_map: self._load_prior() - + self.x_dim = x.shape[-1] if self.num_classes is None: if class_ids is not None: - self.num_classes = np.max(class_ids)+1 + self.num_classes = np.max(class_ids) + 1 else: self.num_classes = p_theta.shape[-1] - + if class_ids is not None: p_theta = int2onehot(class_ids, self.num_classes) if sample_weight is not None: - p_theta = sample_weight[:, None]*p_theta - + p_theta = sample_weight[:, None] * p_theta + N = np.sum(p_theta, axis=0) F = np.dot(p_theta.T, x) if self.update_mu: - xbar = F/N[:,None] + xbar = F / N[:, None] if do_map: - alpha_mu = (N/(N+self.prior.beta))[:, None] - self.mu = (1-alpha_mu)*self.prior.mu + alpha_mu*xbar - self.beta = N+self.prior.beta + alpha_mu = (N / (N + self.prior.beta))[:, None] + self.mu = (1 - alpha_mu) * self.prior.mu + alpha_mu * xbar + self.beta = N + self.prior.beta else: self.mu = xbar self.beta = N else: xbar = self.mu - if self.update_W: if do_map: nu0 = self.prior.nu S0 = invert_pdmat(self.prior.W, return_inv=True)[-1] if self.balance_class_weight: - alpha_W = (N/(N+nu0/self.num_classes))[:, None] - S = (self.num_classes - np.sum(alpha_W))*S0 + alpha_W = (N / (N + nu0 / self.num_classes))[:, None] + S = (self.num_classes - np.sum(alpha_W)) * S0 else: - S = nu0*S0 + S = nu0 * S0 else: nu0 = 0 S = np.zeros((x.shape[1], x.shape[1]), dtype=float_cpu()) - + for k in range(self.num_classes): delta = x - xbar[k] - S_k = np.dot(p_theta[:, k]*delta.T, delta) + S_k = np.dot(p_theta[:, k] * delta.T, delta) if do_map and self.update_mu: mu_delta = xbar[k] - self.prior.mu[k] - S_k += N[k]*(1-alpha_mu[k])*np.outer(mu_delta, mu_delta) + S_k += N[k] * (1 - alpha_mu[k]) * np.outer(mu_delta, mu_delta) if self.balance_class_weight: - S_k /= (N[k]+nu0/self.num_classes) + S_k /= N[k] + nu0 / self.num_classes S += S_k - + if self.balance_class_weight: S /= self.num_classes else: - S /= (nu0+np.sum(N)) + S /= nu0 + np.sum(N) - x_s_mean=np.diag(np.mean(x_s, axis=0)) - S = fullcov_varfloor(S, np.sqrt(x_s_mean)*1.1) + x_s_mean = np.diag(np.mean(x_s, axis=0)) + S = fullcov_varfloor(S, np.sqrt(x_s_mean) * 1.1) S -= x_s_mean - + self.W = invert_pdmat(S, return_inv=True)[-1] - self.nu = 
np.sum(N)+nu0 - + self.nu = np.sum(N) + nu0 + self._change_post_r() self._compute_Ab() - - @staticmethod def _compute_Ab_i(mu, W): A = np.dot(W, mu.T) - b = -0.5 * np.sum(mu.T*A, axis=0) + b = -0.5 * np.sum(mu.T * A, axis=0) return A, b - - diff --git a/hyperion/classifiers/linear_svmc.py b/hyperion/classifiers/linear_svmc.py index 8023aa26..7da4b2dc 100644 --- a/hyperion/classifiers/linear_svmc.py +++ b/hyperion/classifiers/linear_svmc.py @@ -6,7 +6,7 @@ import logging import numpy as np -from sklearn.svm import LinearSVC as SVC +from sklearn.svm import LinearSVC as SVC from ..hyp_defs import float_cpu from ..hyp_model import HypModel @@ -14,19 +14,31 @@ class LinearSVMC(HypModel): + def __init__( + self, + A=None, + b=None, + penalty="l2", + C=1.0, + loss="squared_hinge", + use_bias=True, + bias_scaling=1, + class_weight=None, + random_state=None, + max_iter=100, + dual=True, + tol=0.0001, + multi_class="ovr", + verbose=0, + balance_class_weight=True, + lr_seed=1024, + **kwargs + ): - def __init__(self, A=None, b=None, penalty='l2', C=1.0, - loss='squared_hinge', - use_bias=True, bias_scaling=1, - class_weight=None, random_state=None, max_iter=100, - dual=True, tol=0.0001, multi_class='ovr', verbose=0, - balance_class_weight=True, lr_seed=1024, **kwargs): - - super().__init__(**kwargs) if class_weight is None and balance_class_weight: - class_weight = 'balanced' + class_weight = "balanced" if random_state is None: random_state = np.random.RandomState(seed=lr_seed) @@ -35,13 +47,20 @@ def __init__(self, A=None, b=None, penalty='l2', C=1.0, self.bias_scaling = bias_scaling self.balance_class_weight = balance_class_weight logging.debug(class_weight) - self.svm = SVC(penalty=penalty, C=C, loss=loss, - dual=dual, tol=tol, - fit_intercept=use_bias, intercept_scaling=bias_scaling, - class_weight=class_weight, - random_state=random_state, max_iter=max_iter, - multi_class=multi_class, verbose=verbose) - + self.svm = SVC( + penalty=penalty, + C=C, + loss=loss, + dual=dual, + tol=tol, + fit_intercept=use_bias, + intercept_scaling=bias_scaling, + class_weight=class_weight, + random_state=random_state, + max_iter=max_iter, + multi_class=multi_class, + verbose=verbose, + ) if A is not None: self.svm.coef_ = A.T @@ -49,168 +68,187 @@ def __init__(self, A=None, b=None, penalty='l2', C=1.0, if b is not None: self.svm.intercept_ = b - @property def A(self): return self.svm.coef_.T @property def b(self): - return self.svm.intercept_*self.bias_scaling - + return self.svm.intercept_ * self.bias_scaling def get_config(self): - config = { 'use_bias': self.use_bias, - 'bias_scaling': self.bias_scaling, - 'balance_class_weight': self.balance_class_weight } + config = { + "use_bias": self.use_bias, + "bias_scaling": self.bias_scaling, + "balance_class_weight": self.balance_class_weight, + } base_config = super(LinearSVMC, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - - def predict(self, x, eval_type='logit'): + def predict(self, x, eval_type="logit"): s = np.dot(x, self.A) + self.b - - if eval_type == 'bin-logpost': - return np.log(1+np.exp(-s)) - if eval_type == 'bin-post': - return 1/(1+np.exp(-s)) - if eval_type == 'cat-post': + + if eval_type == "bin-logpost": + return np.log(1 + np.exp(-s)) + if eval_type == "bin-post": + return 1 / (1 + np.exp(-s)) + if eval_type == "cat-post": return softmax(s) - if eval_type == 'cat-logpost': + if eval_type == "cat-logpost": return np.log(softmax(s)) - + return s - - def fit(self, x, class_ids, sample_weight=None): self.svm.fit(x, 
class_ids, sample_weight=sample_weight) def save_params(self, f): - params = { 'A': self.A, - 'b': self.b} + params = {"A": self.A, "b": self.b} self._save_params_from_dict(f, params) - @classmethod def load_params(cls, f, config): - param_list = ['A', 'b'] - params = cls._load_params_to_dict(f, config['name'], param_list) + param_list = ["A", "b"] + params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) - - @staticmethod def filter_train_args(prefix=None, **kwargs): - - valid_args = ('penalty', 'C', 'loss', - 'use_bias', 'bias_scaling', - 'class_weight', 'lr_seed', 'max_iter', - 'dual', 'tol', 'multi_class', 'verbose', - 'balance_class_weight', 'name') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ( + "penalty", + "C", + "loss", + "use_bias", + "bias_scaling", + "class_weight", + "lr_seed", + "max_iter", + "dual", + "tol", + "multi_class", + "verbose", + "balance_class_weight", + "name", + ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - @staticmethod def add_class_train_args(parser, prefix=None): if prefix is None: - p1 = '--' - p2 = '' + p1 = "--" + p2 = "" else: - p1 = '--' + prefix + '.' - p2 = prefix + '.' + p1 = "--" + prefix + "." + p2 = prefix + "." parser.add_argument( - p1+'penalty', - default='l2', choices=['l2', 'l1'], - help='used to specify the norm used in the penalization') - parser.add_argument( - p1+'c', dest=(p2+'C'), - default=1.0, type=float, - help='inverse of regularization strength') + p1 + "penalty", + default="l2", + choices=["l2", "l1"], + help="used to specify the norm used in the penalization", + ) parser.add_argument( - p1+'loss', - default='squared_hinge', choices=['hinge', 'squared_hinge'], - help='type of loss') - + p1 + "c", + dest=(p2 + "C"), + default=1.0, + type=float, + help="inverse of regularization strength", + ) parser.add_argument( - p1+'no-use-bias', dest=(p2+'use_bias'), - default=True, action='store_false', - help='Not use bias') + p1 + "loss", + default="squared_hinge", + choices=["hinge", "squared_hinge"], + help="type of loss", + ) + parser.add_argument( - p1+'bias-scaling', - default=1.0, type=float, - help=('useful only when the solver liblinear is used ' - 'and use_bias is set to True')) + p1 + "no-use-bias", + dest=(p2 + "use_bias"), + default=True, + action="store_false", + help="Not use bias", + ) parser.add_argument( - p1+'lr-seed', - default=1024, type=int, - help='random number generator seed') + p1 + "bias-scaling", + default=1.0, + type=float, + help=( + "useful only when the solver liblinear is used " + "and use_bias is set to True" + ), + ) parser.add_argument( - p1+'max-iter', - default=100, type=int, - help='only for the newton-cg, sag and lbfgs solvers') + p1 + "lr-seed", default=1024, type=int, help="random number generator seed" + ) parser.add_argument( - p1+'no-dual', dest=(p2+'dual'), - default=True, action='store_false', - help=('dual or primal formulation. ' - 'Dual formulation is only implemented for ' - 'l2 penalty with liblinear solver')) + p1 + "max-iter", + default=100, + type=int, + help="only for the newton-cg, sag and lbfgs solvers", + ) parser.add_argument( - p1+'tol', default=1e-4, type=float, - help='tolerance for stopping criteria') + p1 + "no-dual", + dest=(p2 + "dual"), + default=True, + action="store_false", + help=( + "dual or primal formulation. 
" + "Dual formulation is only implemented for " + "l2 penalty with liblinear solver" + ), + ) parser.add_argument( - p1+'multi-class', - default='ovr', choices=['ovr', 'crammer_singer'], - help=('ovr fits a binary problem for each class else ' - 'it minimizes the multinomial loss.')) + p1 + "tol", default=1e-4, type=float, help="tolerance for stopping criteria" + ) parser.add_argument( - p1+'verbose', - default=0, type=int, - help='For the liblinear and lbfgs solvers') - + p1 + "multi-class", + default="ovr", + choices=["ovr", "crammer_singer"], + help=( + "ovr fits a binary problem for each class else " + "it minimizes the multinomial loss." + ), + ) parser.add_argument( - p1+'balance-class-weight', - default=False, action='store_true', - help='Balances the weight of each class when computing W') + p1 + "verbose", + default=0, + type=int, + help="For the liblinear and lbfgs solvers", + ) parser.add_argument( - p1+'name', - default='svc', - help='model name') + p1 + "balance-class-weight", + default=False, + action="store_true", + help="Balances the weight of each class when computing W", + ) - + parser.add_argument(p1 + "name", default="svc", help="model name") @staticmethod def filter_eval_args(prefix, **kwargs): - valid_args = ('model_file', 'eval_type') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("model_file", "eval_type") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod def add_class_eval_args(parser, prefix=None): if prefix is None: - p1 = '--' - p2 = '' + p1 = "--" + p2 = "" else: - p1 = '--' + prefix + '.' - p2 = prefix + '.' + p1 = "--" + prefix + "." + p2 = prefix + "." + parser.add_argument(p1 + "model-file", required=True, help=("model file")) parser.add_argument( - p1+'model-file', required=True, - help=('model file')) - parser.add_argument( - p1+'eval-type', default='logit', - choices=['logit','bin-logpost','bin-post','cat-logpost','cat-post'], - help=('type of evaluation')) + p1 + "eval-type", + default="logit", + choices=["logit", "bin-logpost", "bin-post", "cat-logpost", "cat-post"], + help=("type of evaluation"), + ) - add_argparse_train_args = add_class_train_args add_argparse_eval_args = add_class_eval_args diff --git a/hyperion/classifiers/logistic_regression.py b/hyperion/classifiers/logistic_regression.py index e8c856d0..48763a12 100644 --- a/hyperion/classifiers/logistic_regression.py +++ b/hyperion/classifiers/logistic_regression.py @@ -6,7 +6,7 @@ import logging import numpy as np -from sklearn.linear_model import LogisticRegression as LR +from sklearn.linear_model import LogisticRegression as LR from ..hyp_defs import float_cpu from ..hyp_model import HypModel @@ -14,58 +14,73 @@ class LogisticRegression(HypModel): - - def __init__(self, A=None, b=None, penalty='l2', lambda_reg=1e-5, - use_bias=True, bias_scaling=1, - priors=None, random_state=None, solver='lbfgs', max_iter=100, - dual=False, tol=0.0001, multi_class='multinomial', verbose=0, warm_start=True, num_jobs=1, - lr_seed=1024, **kwargs): - - """ Wrapper for sktlearn logistic regression. - penalty : str, ‘l1’ or ‘l2’, default: ‘l2’ , - Used to specify the norm used in the penalization. The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties. - New in version 0.19: l1 penalty with SAGA solver (allowing ‘multinomial’ + L1) - dual : bool, default: False - Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. 
- tol : float, default: 1e-4 - Tolerance for stopping criteria. - lambda_reg : float, default: 1e-5 - Regularization strength; must be a positive float. - use_bias : bool, default: True - Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. - bias_scaling : float, default 1. - Useful only when the solver ‘liblinear’ is used and use_bias is set to True. - In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. - Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. - priors : dict or ‘balanced' default: None - Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one. - The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). - Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. - random_state : int, RandomState instance or None, optional, default: None - The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. - solver : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, - default: ‘liblinear’ Algorithm to use in the optimization problem. - For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and - ‘saga’ are faster for large ones. - For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ - handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes. - ‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas - ‘liblinear’ and ‘saga’ handle L1 penalty. - Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale. - New in version 0.17: Stochastic Average Gradient descent solver. - New in version 0.19: SAGA solver. - max_iter : int, default: 100 - Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge. - multi_class : str, {‘ovr’, ‘multinomial’}, default: ‘ovr’ - Multiclass option can be either ‘ovr’ or ‘multinomial’. If the option chosen is ‘ovr’, then a binary problem is fit for each label. Else the loss minimised is the multinomial loss fit across the entire probability distribution. Does not work for liblinear solver. - New in version 0.18: Stochastic Average Gradient descent solver for ‘multinomial’ case. - verbose : int, default: 0 - For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. - warm_start : bool, default: False - When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. - New in version 0.17: warm_start to support lbfgs, newton-cg, sag, saga solvers. - n_jobs : int, default: 1 - Number of CPU cores used when parallelizing over classes if multi_class=’ovr’”. 
This parameter is ignored when the ``solver``is set to ‘liblinear’ regardless of whether ‘multi_class’ is specified or not. If given a value of -1, all cores are used. + def __init__( + self, + A=None, + b=None, + penalty="l2", + lambda_reg=1e-5, + use_bias=True, + bias_scaling=1, + priors=None, + random_state=None, + solver="lbfgs", + max_iter=100, + dual=False, + tol=0.0001, + multi_class="multinomial", + verbose=0, + warm_start=True, + num_jobs=1, + lr_seed=1024, + **kwargs + ): + + """Wrapper for sktlearn logistic regression. + penalty : str, ‘l1’ or ‘l2’, default: ‘l2’ , + Used to specify the norm used in the penalization. The ‘newton-cg’, ‘sag’ and ‘lbfgs’ solvers support only l2 penalties. + New in version 0.19: l1 penalty with SAGA solver (allowing ‘multinomial’ + L1) + dual : bool, default: False + Dual or primal formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer dual=False when n_samples > n_features. + tol : float, default: 1e-4 + Tolerance for stopping criteria. + lambda_reg : float, default: 1e-5 + Regularization strength; must be a positive float. + use_bias : bool, default: True + Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function. + bias_scaling : float, default 1. + Useful only when the solver ‘liblinear’ is used and use_bias is set to True. + In this case, x becomes [x, bias_scaling], i.e. a “synthetic” feature with constant value equal to intercept_scaling is appended to the instance vector. The intercept becomes intercept_scaling * synthetic_feature_weight. + Note! the synthetic feature weight is subject to l1/l2 regularization as all other features. To lessen the effect of regularization on synthetic feature weight (and therefore on the intercept) bias_scaling has to be increased. + priors : dict or ‘balanced' default: None + Weights associated with classes in the form {class_label: weight}. If not given, all classes are supposed to have weight one. + The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as n_samples / (n_classes * np.bincount(y)). + Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. + random_state : int, RandomState instance or None, optional, default: None + The seed of the pseudo random number generator to use when shuffling the data. If int, random_state is the seed used by the random number generator; If RandomState instance, random_state is the random number generator; . Used when solver == ‘sag’ or ‘liblinear’. + solver : {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, + default: ‘liblinear’ Algorithm to use in the optimization problem. + For small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and + ‘saga’ are faster for large ones. + For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ + handle multinomial loss; ‘liblinear’ is limited to one-versus-rest schemes. + ‘newton-cg’, ‘lbfgs’ and ‘sag’ only handle L2 penalty, whereas + ‘liblinear’ and ‘saga’ handle L1 penalty. + Note that ‘sag’ and ‘saga’ fast convergence is only guaranteed on features with approximately the same scale. + New in version 0.17: Stochastic Average Gradient descent solver. + New in version 0.19: SAGA solver. + max_iter : int, default: 100 + Useful only for the newton-cg, sag and lbfgs solvers. Maximum number of iterations taken for the solvers to converge. 
+ multi_class : str, {‘ovr’, ‘multinomial’}, default: ‘ovr’ + Multiclass option can be either ‘ovr’ or ‘multinomial’. If the option chosen is ‘ovr’, then a binary problem is fit for each label. Else the loss minimised is the multinomial loss fit across the entire probability distribution. Does not work for liblinear solver. + New in version 0.18: Stochastic Average Gradient descent solver for ‘multinomial’ case. + verbose : int, default: 0 + For the liblinear and lbfgs solvers set verbose to any positive number for verbosity. + warm_start : bool, default: False + When set to True, reuse the solution of the previous call to fit as initialization, otherwise, just erase the previous solution. Useless for liblinear solver. + New in version 0.17: warm_start to support lbfgs, newton-cg, sag, saga solvers. + n_jobs : int, default: 1 + Number of CPU cores used when parallelizing over classes if multi_class=’ovr’”. This parameter is ignored when the ``solver``is set to ‘liblinear’ regardless of whether ‘multi_class’ is specified or not. If given a value of -1, all cores are used. """ super(LogisticRegression, self).__init__(**kwargs) @@ -73,7 +88,7 @@ def __init__(self, A=None, b=None, penalty='l2', lambda_reg=1e-5, random_state = np.random.RandomState(seed=lr_seed) if bias_scaling is None: - if use_bias and solver == 'liblinear': + if use_bias and solver == "liblinear": bias_scaling = 100 else: bias_scaling = 1 @@ -84,20 +99,27 @@ def __init__(self, A=None, b=None, penalty='l2', lambda_reg=1e-5, self.lambda_reg = lambda_reg self.multi_class = multi_class print(locals()) - self.lr = LR(penalty=penalty, C=1/lambda_reg, - dual=dual, tol=tol, - fit_intercept=use_bias, intercept_scaling=bias_scaling, - random_state=random_state, - solver=solver, max_iter=max_iter, - multi_class=multi_class, - verbose=verbose, warm_start=warm_start, n_jobs=num_jobs) + self.lr = LR( + penalty=penalty, + C=1 / lambda_reg, + dual=dual, + tol=tol, + fit_intercept=use_bias, + intercept_scaling=bias_scaling, + random_state=random_state, + solver=solver, + max_iter=max_iter, + multi_class=multi_class, + verbose=verbose, + warm_start=warm_start, + n_jobs=num_jobs, + ) if A is not None: self.lr.coef_ = A.T if b is not None: - self.lr.intercept_ = b/self.bias_scaling - + self.lr.intercept_ = b / self.bias_scaling @property def A(self): @@ -105,197 +127,207 @@ def A(self): @property def b(self): - return self.lr.intercept_*self.bias_scaling - + return self.lr.intercept_ * self.bias_scaling def get_config(self): - config = { 'use_bias': self.use_bias, - 'bias_scaling': self.bias_scaling, - 'priors': self.priors } + config = { + "use_bias": self.use_bias, + "bias_scaling": self.bias_scaling, + "priors": self.priors, + } base_config = super(LogisticRegression, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - - def predict(self, x, eval_type='logit'): + def predict(self, x, eval_type="logit"): if x.ndim == 1: x = x[:, None] - + y = np.dot(x, self.A) + self.b - if eval_type == 'log-post': - y = np.log(softmax(y + np.log(self.priors), axis=1)+1e-10) - if eval_type == 'post': + if eval_type == "log-post": + y = np.log(softmax(y + np.log(self.priors), axis=1) + 1e-10) + if eval_type == "post": y = softmax(y + np.log(self.priors)) - + return y - - def fit(self, x, class_ids, sample_weight=None): if x.ndim == 1: x = x[:, None] - num_classes = np.max(class_ids)+1 + num_classes = np.max(class_ids) + 1 counts = np.bincount(class_ids) assert num_classes == len(counts) - + if self.priors is None: - priors = 
1/num_classes * np.ones((num_classes,), dtype=float_cpu()) + priors = 1 / num_classes * np.ones((num_classes,), dtype=float_cpu()) else: priors = [self.priors[i] for i in range(num_classes)] - class_weights = priors/counts - + class_weights = priors / counts + if sample_weight is None: sample_weight = class_weights[class_ids] else: sample_weight *= class_weights[class_ids] - + self.lr.fit(x, class_ids, sample_weight=sample_weight) - if self.multi_class == 'ovr': - #adjust bias to produce log-llk ratios + if self.multi_class == "ovr": + # adjust bias to produce log-llk ratios if len(self.lr.intercept_) == 1: priors = self.priors[1] - self.lr.intercept_ -= np.log(priors/(1-priors))/self.bias_scaling + self.lr.intercept_ -= np.log(priors / (1 - priors)) / self.bias_scaling else: - #adjust bias to produce log-llk - self.lr.intercept_ -= np.log(self.priors)/self.bias_scaling - + # adjust bias to produce log-llk + self.lr.intercept_ -= np.log(self.priors) / self.bias_scaling - def save_params(self, f): - params = { 'A': self.A, - 'b': self.b} + params = {"A": self.A, "b": self.b} self._save_params_from_dict(f, params) - @classmethod def load_params(cls, f, config): - param_list = ['A', 'b'] - params = cls._load_params_to_dict(f, config['name'], param_list) + param_list = ["A", "b"] + params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) - - @staticmethod def filter_args(prefix=None, **kwargs): if prefix is None: - p = '' + p = "" else: - p = prefix + '_' - - valid_args = ('penalty', 'lambda_reg', - 'use_bias', 'bias_scaling', 'no_use_bias', - 'priors', 'lr_seed', - 'solver', 'max_iter', - 'dual', 'tol', 'multi_class', 'verbose', - 'warm_start', 'no_warm_start', 'num_jobs', 'name') - d = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - if 'no_use_bias' in d: - d['use_bias'] = not d['no_use_bias'] - if 'no_warm_start' in d: - d['warm_start'] = not d['no_warm_start'] - + p = prefix + "_" + + valid_args = ( + "penalty", + "lambda_reg", + "use_bias", + "bias_scaling", + "no_use_bias", + "priors", + "lr_seed", + "solver", + "max_iter", + "dual", + "tol", + "multi_class", + "verbose", + "warm_start", + "no_warm_start", + "num_jobs", + "name", + ) + d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + if "no_use_bias" in d: + d["use_bias"] = not d["no_use_bias"] + if "no_warm_start" in d: + d["warm_start"] = not d["no_warm_start"] + return d filter_train_args = filter_args - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." 
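The fit() method above weights samples by priors/counts and, in the binary "ovr" case, subtracts logit(prior) from the intercept so the trained scores behave like log-likelihood ratios rather than posterior logits. A minimal sketch of that calibration recipe with plain scikit-learn, assuming bias_scaling = 1, a hypothetical target prior, and synthetic 1-D scores:

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
# Hypothetical 1-D binary calibration data; class 1 scores are shifted up.
x0 = rng.normal(-1.0, 1.0, size=(1000, 1))
x1 = rng.normal(1.0, 1.0, size=(300, 1))
x = np.vstack([x0, x1])
y = np.concatenate([np.zeros(len(x0), dtype=int), np.ones(len(x1), dtype=int)])

prior = 0.1                               # target prior for class 1 (not the data prior)
counts = np.bincount(y)
class_weights = np.array([1 - prior, prior]) / counts
sample_weight = class_weights[y]          # as in fit(): priors / counts, indexed by label

lr = LogisticRegression(C=1.0 / 1e-5, solver="lbfgs")
lr.fit(x, y, sample_weight=sample_weight)

# Remove the prior log-odds from the bias (bias_scaling assumed 1) to get LLR-like scores.
llr_bias = lr.intercept_ - np.log(prior / (1 - prior))
llr = x @ lr.coef_.T + llr_bias
print(llr[:3].ravel())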
parser.add_argument( - p1+'penalty', - default='l2', choices=['l2', 'l1'], - help='used to specify the norm used in the penalization') + p1 + "penalty", + default="l2", + choices=["l2", "l1"], + help="used to specify the norm used in the penalization", + ) parser.add_argument( - p1+'lambda-reg', - default=1e-5, type=float, - help='regularization strength') + p1 + "lambda-reg", default=1e-5, type=float, help="regularization strength" + ) parser.add_argument( - p1+'no-use-bias', - default=False, action='store_true', - help='Not use bias') + p1 + "no-use-bias", default=False, action="store_true", help="Not use bias" + ) parser.add_argument( - p1+'bias-scaling', - default=1.0, type=float, - help='useful only when the solver liblinear is used and use_bias is set to True') + p1 + "bias-scaling", + default=1.0, + type=float, + help="useful only when the solver liblinear is used and use_bias is set to True", + ) parser.add_argument( - p1+'lr-seed', - default=1024, type=int, - help='random number generator seed') + p1 + "lr-seed", default=1024, type=int, help="random number generator seed" + ) parser.add_argument( - p1+'solver', - default='lbfgs', - choices=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], - help='type of solver') + p1 + "solver", + default="lbfgs", + choices=["newton-cg", "lbfgs", "liblinear", "sag", "saga"], + help="type of solver", + ) parser.add_argument( - p1+'max-iter', - default=100, type=int, - help='only for the newton-cg, sag and lbfgs solvers') + p1 + "max-iter", + default=100, + type=int, + help="only for the newton-cg, sag and lbfgs solvers", + ) parser.add_argument( - p1+'dual', - default=False, action='store_true', - help=('dual or primal formulation. ' - 'Dual formulation is only implemented for ' - 'l2 penalty with liblinear solver')) + p1 + "dual", + default=False, + action="store_true", + help=( + "dual or primal formulation. " + "Dual formulation is only implemented for " + "l2 penalty with liblinear solver" + ), + ) parser.add_argument( - p1+'tol', default=1e-4, type=float, - help='tolerance for stopping criteria') + p1 + "tol", default=1e-4, type=float, help="tolerance for stopping criteria" + ) parser.add_argument( - p1+'multi-class', - default='ovr', choices=['ovr', 'multinomial'], - help=('ovr fits a binary problem for each class else ' - 'it minimizes the multinomial loss.' - 'Does not work for liblinear solver')) + p1 + "multi-class", + default="ovr", + choices=["ovr", "multinomial"], + help=( + "ovr fits a binary problem for each class else " + "it minimizes the multinomial loss." 
+ "Does not work for liblinear solver" + ), + ) parser.add_argument( - p1+'verbose', - default=0, type=int, - help='For the liblinear and lbfgs solvers') + p1 + "verbose", + default=0, + type=int, + help="For the liblinear and lbfgs solvers", + ) parser.add_argument( - p1+'num-jobs', - default=1, type=int, - help='number of cores for ovr') + p1 + "num-jobs", default=1, type=int, help="number of cores for ovr" + ) parser.add_argument( - p1+'no-warm-start', - default=False, action='store_true', - help='don\'t use previous model to start') + p1 + "no-warm-start", + default=False, + action="store_true", + help="don't use previous model to start", + ) - parser.add_argument( - p1+'name', - default='lr', - help='model name') + parser.add_argument(p1 + "name", default="lr", help="model name") - @staticmethod def filter_eval_args(prefix, **kwargs): - valid_args = ('model_file', 'eval_type') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("model_file", "eval_type") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - @staticmethod def add_eval_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." + parser.add_argument(p1 + "model-file", required=True, help=("model file")) parser.add_argument( - p1+'model-file', required=True, - help=('model file')) - parser.add_argument( - p1+'eval-type', default='logit', - choices=['logit','log-post','post'], - help=('type of evaluation')) - - + p1 + "eval-type", + default="logit", + choices=["logit", "log-post", "post"], + help=("type of evaluation"), + ) + add_argparse_args = add_class_args add_argparse_train_args = add_class_args add_argparse_eval_args = add_eval_args diff --git a/hyperion/classifiers/q_scoring_homo_gbe.py b/hyperion/classifiers/q_scoring_homo_gbe.py index e9a67a3a..83f2408b 100644 --- a/hyperion/classifiers/q_scoring_homo_gbe.py +++ b/hyperion/classifiers/q_scoring_homo_gbe.py @@ -12,14 +12,18 @@ from ..utils.math import int2onehot, logdet_pdmat, invert_pdmat, softmax - class QScoringHomoGBE(HypModel): - - def __init__(self, mu=None, W=None, N=None, - balance_class_weight=True, - prior=None, prior_N=None, - post_N=None, - **kwargs): + def __init__( + self, + mu=None, + W=None, + N=None, + balance_class_weight=True, + prior=None, + prior_N=None, + post_N=None, + **kwargs + ): super(QScoringHomoGBE, self).__init__(**kwargs) @@ -31,50 +35,40 @@ def __init__(self, mu=None, W=None, N=None, self.prior_N = prior_N self.post_N = post_N - - @property def x_dim(self): return None if self.mu is None else self.mu.shape[1] - - @property def num_classes(self): return None if self.mu is None else self.mu.shape[0] - - def get_config(self): - config = { 'balance_class_weight': self.balance_class_weight, - 'prior_N': self.prior_N } - + config = { + "balance_class_weight": self.balance_class_weight, + "prior_N": self.prior_N, + } + base_config = super(QScoringHomoGBE, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - def _load_prior(self): if isinstance(self.prior, str): self.prior = QScoringHomoGBE.load(self.prior) num_classes = self.prior.mu.shape[0] if self.prior_N is not None: - self.prior.W = 1 + self.prior_N/np.mean(self.prior.N)*(self.W-1) - self.prior.N = self.prior_N*np.ones((num_classes,), dtype=float_cpu()) - + self.prior.W = 1 + self.prior_N / np.mean(self.prior.N) * (self.W - 1) + self.prior.N = self.prior_N * np.ones((num_classes,), dtype=float_cpu()) - def _change_post_N(self): if 
self.post_N is not None: logging.debug(self.N) logging.debug(self.W) - self.W = 1 + self.post_N/np.mean(self.N)*(self.W-1) - self.N = self.post_N*np.ones((self.num_classes,), dtype=float_cpu()) + self.W = 1 + self.post_N / np.mean(self.N) * (self.W - 1) + self.N = self.post_N * np.ones((self.num_classes,), dtype=float_cpu()) logging.debug(self.N) logging.debug(self.W) - - def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): assert class_ids is not None or p_theta is not None @@ -82,165 +76,148 @@ def fit(self, x, class_ids=None, p_theta=None, sample_weight=None): if do_map: self._load_prior() - x_dim = int(x.shape[-1]/2) + x_dim = int(x.shape[-1] / 2) if self.num_classes is None: if class_ids is not None: - num_classes = np.max(class_ids)+1 + num_classes = np.max(class_ids) + 1 else: num_classes = p_theta.shape[-1] else: num_classes = self.num_classes - + if class_ids is not None: p_theta = int2onehot(class_ids, num_classes) if sample_weight is not None: - p_theta = sample_weight[:, None]*p_theta + p_theta = sample_weight[:, None] * p_theta - mu_x = x[:,:x_dim] - s_x = x[:,x_dim:] + mu_x = x[:, :x_dim] + s_x = x[:, x_dim:] - prec_x = 1/s_x + prec_x = 1 / s_x N = np.sum(p_theta, axis=0) - eta = np.dot(p_theta.T, prec_x*mu_x) + eta = np.dot(p_theta.T, prec_x * mu_x) prec = 1 + np.dot(p_theta.T, prec_x - 1) if self.prior is not None: - eta += self.prior.W*self.prior.mu - prec += (self.prior.W - 1) + eta += self.prior.W * self.prior.mu + prec += self.prior.W - 1 N += self.prior.N - - C = 1/prec - self.mu = C*eta + + C = 1 / prec + self.mu = C * eta self.N = N if self.balance_class_weight: - prec = 1 + np.mean(prec-1, axis=0) + prec = 1 + np.mean(prec - 1, axis=0) else: - prec = 1 + np.sum(prec_x-1, axis=0)/num_classes + prec = 1 + np.sum(prec_x - 1, axis=0) / num_classes self.W = prec - - self._change_post_N() + self._change_post_N() def predict(self, x, normalize=False): - mu_x = x[:,:self.x_dim] - s_x = x[:,self.x_dim:] - prec_x = 1/s_x + mu_x = x[:, : self.x_dim] + s_x = x[:, self.x_dim :] + prec_x = 1 / s_x eta_e = self.mu * self.W L_e = self.W eta_t = prec_x * mu_x L_t = prec_x - L_et = L_t + L_e - 1 # (batch x dim) + L_et = L_t + L_e - 1 # (batch x dim) - C_et = 1/L_et # (batch x dim) - C_e = C_et - 1/L_e # (batch x dim) - C_t = C_et - 1/L_t # (batch x dim) + C_et = 1 / L_et # (batch x dim) + C_e = C_et - 1 / L_e # (batch x dim) + C_t = C_et - 1 / L_t # (batch x dim) - r_e = np.sum(np.log(L_e), axis=0, keepdims=True) + np.dot(eta_e*eta_e, C_e.T) # (num_classes x batch) - r_t = np.sum(np.log(L_t), axis=1, keepdims=True) + np.sum(C_t*eta_t**2, axis=1, keepdims=True) # (batch x 1) - r_et = -np.sum(np.log(L_et), axis=1, keepdims=True) + 2 * np.dot(eta_t*C_et, eta_e.T) # (batch x num_classes) - logp = 0.5*(r_et +r_e.T + r_t) + r_e = np.sum(np.log(L_e), axis=0, keepdims=True) + np.dot( + eta_e * eta_e, C_e.T + ) # (num_classes x batch) + r_t = np.sum(np.log(L_t), axis=1, keepdims=True) + np.sum( + C_t * eta_t ** 2, axis=1, keepdims=True + ) # (batch x 1) + r_et = -np.sum(np.log(L_et), axis=1, keepdims=True) + 2 * np.dot( + eta_t * C_et, eta_e.T + ) # (batch x num_classes) + logp = 0.5 * (r_et + r_e.T + r_t) if normalize: logp = np.log(softmax(logp, axis=1)) - + return logp - - def save_params(self, f): - params = { 'mu': self.mu, - 'W': self.W, - 'N': self.N } + params = {"mu": self.mu, "W": self.W, "N": self.N} self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'W', 'N'] - params = cls._load_params_to_dict(f, 
config['name'], param_list) + param_list = ["mu", "W", "N"] + params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) - - - - - @staticmethod def filter_train_args(prefix=None, **kwargs): - - valid_args = ('balance_class_weight', - 'prior', 'prior_N', 'post_N', - 'name') - - d = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - + + valid_args = ("balance_class_weight", "prior", "prior_N", "post_N", "name") + + d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + return d - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." parser.add_argument( - p1+'balance-class-weight', - default=False, action='store_true', - help='Balances the weight of each class when computing W') - parser.add_argument( - p1+'prior', - default=None, - help='prior file for MAP adaptation') + p1 + "balance-class-weight", + default=False, + action="store_true", + help="Balances the weight of each class when computing W", + ) parser.add_argument( - p1+'prior-N', - default=None, type=float, - help='relevance factor for prior') + p1 + "prior", default=None, help="prior file for MAP adaptation" + ) parser.add_argument( - p1+'post-N', - default=None, type=float, - help='relevance factor for posterior') - + p1 + "prior-N", default=None, type=float, help="relevance factor for prior" + ) parser.add_argument( - p1+'name', - default='q_scoring', - help='model name') + p1 + "post-N", + default=None, + type=float, + help="relevance factor for posterior", + ) + parser.add_argument(p1 + "name", default="q_scoring", help="model name") add_argparse_train_args = add_class_args - @staticmethod def filter_eval_args(prefix, **kwargs): - valid_args = ('model_file', 'normalize') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("model_file", "normalize") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod def add_eval_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." 
+ parser.add_argument(p1 + "model-file", required=True, help=("model file")) parser.add_argument( - p1+'model-file', required=True, - help=('model file')) - parser.add_argument( - p1+'normalize', default=False, - action='store_true', - help=('normalizes the ouput probabilities to sum to one')) - - + p1 + "normalize", + default=False, + action="store_true", + help=("normalizes the ouput probabilities to sum to one"), + ) + add_argparse_eval_args = add_eval_args diff --git a/hyperion/clustering/ahc.py b/hyperion/clustering/ahc.py index e51f326a..2f8dbe08 100644 --- a/hyperion/clustering/ahc.py +++ b/hyperion/clustering/ahc.py @@ -15,46 +15,40 @@ class AHC(HypModel): - - def __init__(self, method='average', metric='llr', **kwargs): + def __init__(self, method="average", metric="llr", **kwargs): super().__init__(**kwargs) self.method = method self.metric = metric self.Z = None self.flat_clusters = None - def fit(self, x, mask=None): if mask is not None: x = copy(x) - x[mask==False] = -1e10 + x[mask == False] = -1e10 idx = np.triu(np.ones_like(x, dtype=bool), k=1) scores = x[idx] - if self.metric == 'llr': + if self.metric == "llr": max_score = np.max(scores) - scores = - scores + max_score + scores = -scores + max_score self.Z = linkage(scores, method=self.method) - self.Z[:, 2] = - self.Z[:, 2] + max_score - elif self.metric == 'prob': + self.Z[:, 2] = -self.Z[:, 2] + max_score + elif self.metric == "prob": scores = 1 - scores self.Z = linkage(scores, method=self.method) self.Z[:, 2] = 1 - self.Z[:, 2] else: self.Z = linkage(scores, method=self.method, metric=self.metric) - - - def get_flat_clusters(self, t, criterion='threshold'): - if criterion == 'threshold': + def get_flat_clusters(self, t, criterion="threshold"): + if criterion == "threshold": return self.get_flat_clusters_from_thr(t) else: return self.get_flat_clusters_from_num_clusters(t) - - def get_flat_clusters_from_num_clusters(self, num_clusters): N = self.Z.shape[0] + 1 num_clusters = min(N, num_clusters) @@ -64,41 +58,37 @@ def get_flat_clusters_from_num_clusters(self, num_clusters): flat_clusters = np.arange(N, dtype=int) for i in range(p_idx): - segm_idx = np.logical_or(flat_clusters==self.Z[i,0], - flat_clusters==self.Z[i,1]) + segm_idx = np.logical_or( + flat_clusters == self.Z[i, 0], flat_clusters == self.Z[i, 1] + ) flat_clusters[segm_idx] = N + i _, flat_clusters = np.unique(flat_clusters, return_inverse=True) return flat_clusters - - def get_flat_clusters_from_thr(self, thr): - if self.metric == 'llr' or self.metric == 'prob': - idx = self.Z[:,2] >= thr + if self.metric == "llr" or self.metric == "prob": + idx = self.Z[:, 2] >= thr else: - idx = self.Z[:,2] <= thr + idx = self.Z[:, 2] <= thr num_clusters = self.Z.shape[0] + 1 - np.sum(idx) return self.get_flat_clusters_from_num_clusters(num_clusters) - - def compute_flat_clusters(): - N = self.Z.shape[0]+1 - flat_clusters = np.zeros((N,N), dtype=int) + N = self.Z.shape[0] + 1 + flat_clusters = np.zeros((N, N), dtype=int) flat_clusters[0] = np.arange(N, dtype=int) - for i in range(N-1): - flat_clusters[i+1] = flat_clusters[i] - segm_idx = np.logical_or(flat_clusters[i]==self.Z[i,0], - flat_clusters[i]==self.Z[i,1]) - flat_clusters[i+1][segm_idx] = N + i - - for i in range(1,N): + for i in range(N - 1): + flat_clusters[i + 1] = flat_clusters[i] + segm_idx = np.logical_or( + flat_clusters[i] == self.Z[i, 0], flat_clusters[i] == self.Z[i, 1] + ) + flat_clusters[i + 1][segm_idx] = N + i + + for i in range(1, N): _, flat_clusters[i] = np.unique(flat_clusters, 
return_inverse=True) self.flat_clusters = flat_clusters - - def evaluate_impurity_det(self, labels_true): if self.flat_clusters is None: self.compute_flat_clusters() @@ -112,5 +102,4 @@ def evaluate_impurity_det(self, labels_true): h[i] = homogeneity_score(labels_true, self.flat_clusters[i]) c[i] = completeness_score(labels_true, self.flat_clusters[i]) - return 1-h, 1-c - + return 1 - h, 1 - c diff --git a/hyperion/clustering/kmeans.py b/hyperion/clustering/kmeans.py index 217e2149..7da2bd01 100644 --- a/hyperion/clustering/kmeans.py +++ b/hyperion/clustering/kmeans.py @@ -13,14 +13,12 @@ class KMeans(HypModel): - def __init__(self, num_clusters, mu=None, rtol=0.001, **kwargs): super(KMeans, self).__init__(**kwargs) self.num_clusters = num_clusters self.mu = mu self.rtol = rtol - def fit(self, x, epochs=100): loss = np.zeros((epochs,), dtype=float_cpu()) self.mu = self._choose_seeds(x) @@ -30,43 +28,36 @@ def fit(self, x, epochs=100): cluster_index, err2 = self.predict(x) loss[epoch] = np.mean(err2) if epoch > 0: - delta = np.abs(loss[epoch-1]-loss[epoch])/loss[epoch-1] + delta = np.abs(loss[epoch - 1] - loss[epoch]) / loss[epoch - 1] if delta < self.rtol: - loss = loss[:epoch+1] + loss = loss[: epoch + 1] break return loss, cluster_index - - def _choose_seeds(self, x): mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) mu[0] = x[0] for i in range(1, self.num_clusters): d = np.zeros((x.shape[0],), dtype=float_cpu()) for j in range(i): - d += np.sum(np.square(x-mu[j]), axis=-1) + d += np.sum(np.square(x - mu[j]), axis=-1) index = np.argmax(d) mu[i] = x[index] return mu - - def _compute_centroids(self, x, index): mu = np.zeros((self.num_clusters, x.shape[-1]), dtype=float_cpu()) for k in range(self.num_clusters): r = index == k - if np.sum(r)>0: - mu[k] = np.mean(x[index==k], axis=0) + if np.sum(r) > 0: + mu[k] = np.mean(x[index == k], axis=0) return mu - def predict(self, x): err2 = np.zeros((x.shape[0], self.num_clusters), dtype=float_cpu()) for k in range(self.num_clusters): - err2[:,k] = np.sum(np.square(x-self.mu[k]), axis=-1) + err2[:, k] = np.sum(np.square(x - self.mu[k]), axis=-1) index = np.argmin(err2, axis=-1) - return index, err2[np.arange(x.shape[0]),index] - - + return index, err2[np.arange(x.shape[0]), index] diff --git a/hyperion/diarization/diar_ahc_plda.py b/hyperion/diarization/diar_ahc_plda.py index 5aade454..018334c2 100644 --- a/hyperion/diarization/diar_ahc_plda.py +++ b/hyperion/diarization/diar_ahc_plda.py @@ -8,7 +8,8 @@ import numpy as np import h5py import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt @@ -16,10 +17,17 @@ from ..pdfs import GMMTiedDiagCov as GMM from ..transforms import PCA, LNorm + class DiarAHCPLDA(object): - - def __init__(self, plda_model, preproc=None, - threshold=0, pca_var_r=1, do_unsup_cal=False, use_bic=False): + def __init__( + self, + plda_model, + preproc=None, + threshold=0, + pca_var_r=1, + do_unsup_cal=False, + use_bic=False, + ): self.plda_model = plda_model self.preproc = preproc @@ -29,7 +37,6 @@ def __init__(self, plda_model, preproc=None, self.use_bic = use_bic and do_unsup_cal self._ahc = AHC() - @staticmethod def _plot_score_hist(scores, output_file, thr=None, gmm=None): @@ -38,51 +45,61 @@ def _plot_score_hist(scores, output_file, thr=None, gmm=None): mask = np.triu(np.ones(scores.shape, dtype=np.bool), 1) scores_r = scores[mask].ravel() - - _, bins, _ = plt.hist(scores_r, 100, - histtype='step', density=True, color='b', - linestyle='solid', linewidth=1.5) + + _, 
bins, _ = plt.hist( + scores_r, + 100, + histtype="step", + density=True, + color="b", + linestyle="solid", + linewidth=1.5, + ) if thr is not None: - plt.axvline(x=thr, color='k') + plt.axvline(x=thr, color="k") if gmm is not None: - prob = np.exp(gmm.log_prob(bins[:,None])) - plt.plot(bins, prob, color='r', - linestyle='solid', linewidth=1.5) - - #plt.title(name) - plt.xlabel('LLR score') + prob = np.exp(gmm.log_prob(bins[:, None])) + plt.plot(bins, prob, color="r", linestyle="solid", linewidth=1.5) + + # plt.title(name) + plt.xlabel("LLR score") plt.grid(True) - #plt.legend() + # plt.legend() plt.savefig(output_file) plt.clf() - @staticmethod def _unsup_gmm_calibration(scores): mask = np.triu(np.ones(scores.shape, dtype=np.bool), 1) - scores_r = scores[mask].ravel()[:,None] # N x 1 + scores_r = scores[mask].ravel()[:, None] # N x 1 gmm_1c = GMM(num_comp=1) gmm_1c.fit(scores_r, epochs=1) gmm_2c = gmm_1c.split_comp(2) e = gmm_2c.fit(scores_r, epochs=20) scale = (gmm_2c.mu[0] - gmm_2c.mu[1]) * gmm_2c.Lambda - bias = (gmm_2c.mu[1]**2 - gmm_2c.mu[0]**2) * gmm_2c.Lambda / 2 + np.log(gmm_2c.pi[0]) - np.log(gmm_2c.pi[1]) + bias = ( + (gmm_2c.mu[1] ** 2 - gmm_2c.mu[0] ** 2) * gmm_2c.Lambda / 2 + + np.log(gmm_2c.pi[0]) + - np.log(gmm_2c.pi[1]) + ) scores = scale * scores + bias bic_lambda = 1 n = len(scores_r) dparams = 4 - bic = np.mean(gmm_2c.log_prob(scores_r) - gmm_1c.log_prob(scores_r)) - bic_lambda * dparams * np.log(n)/2/n + bic = ( + np.mean(gmm_2c.log_prob(scores_r) - gmm_1c.log_prob(scores_r)) + - bic_lambda * dparams * np.log(n) / 2 / n + ) return scores, bic, gmm_2c - def cluster(self, x, hist_file=None): x = self.preproc.predict(x) if self.pca_var_r < 1: pca = PCA(pca_var_r=self.pca_var_r, whiten=True) pca.fit(x) - logging.info('PCA dim=%d' % pca.pca_dim) + logging.info("PCA dim=%d" % pca.pca_dim) x = pca.predict(x) x = LNorm().predict(x) plda_model = self.plda_model.project(pca.T, pca.mu) @@ -92,65 +109,63 @@ def cluster(self, x, hist_file=None): scores = plda_model.llr_1vs1(x, x) if self.do_unsup_cal: scores_cal, bic, gmm_2c = self._unsup_gmm_calibration(scores) - logging.info('UnsupCal. BIC={} gmm.pi={} gmm.mu={} gmm.sigma={}'.format( - bic, gmm_2c.pi, gmm_2c.mu, np.sqrt(1./gmm_2c.Lambda))) + logging.info( + "UnsupCal. BIC={} gmm.pi={} gmm.mu={} gmm.sigma={}".format( + bic, gmm_2c.pi, gmm_2c.mu, np.sqrt(1.0 / gmm_2c.Lambda) + ) + ) if hist_file: - hist_file_1 = '%s-nocal.pdf' % hist_file + hist_file_1 = "%s-nocal.pdf" % hist_file self._plot_score_hist(scores, hist_file_1, None, gmm_2c) scores = scores_cal if hist_file: - hist_file_1 = '%s.pdf' % hist_file + hist_file_1 = "%s.pdf" % hist_file self._plot_score_hist(scores, hist_file_1, self.threshold) if self.use_bic and bic < 0: # unsup calibration detected only one Gaussian -> only target trials class_ids = np.zeros(len(x), dtype=np.int) return class_ids - + self._ahc.fit(scores) class_ids = self._ahc.get_flat_clusters(self.threshold) return class_ids - @staticmethod def filter_args(**kwargs): """Filters diarization args from arguments dictionary. - - Args: - prefix: Options prefix. - kwargs: Arguments dictionary. - - Returns: - Dictionary with diarization options. + + Args: + prefix: Options prefix. + kwargs: Arguments dictionary. + + Returns: + Dictionary with diarization options. 
""" - valid_args = ('threshold', 'pca_var_r', 'do_unsup_cal', 'use_bic') - - d = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("threshold", "pca_var_r", "do_unsup_cal", "use_bic") + + d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return d - - @staticmethod def add_class_args(parser, prefix=None): """Adds diarization options to parser. - - Args: - parser: Arguments parser - prefix: Options prefix. + + Args: + parser: Arguments parser + prefix: Options prefix. """ if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'threshold', default=0, type=float) - parser.add_argument(p1+'pca-var-r', default=1, type=float) - parser.add_argument(p1+'do-unsup-cal', default=False, action='store_true') - parser.add_argument(p1+'use-bic', default=False, action='store_true') + p1 = "--" + prefix + "." + parser.add_argument(p1 + "threshold", default=0, type=float) + parser.add_argument(p1 + "pca-var-r", default=1, type=float) + parser.add_argument(p1 + "do-unsup-cal", default=False, action="store_true") + parser.add_argument(p1 + "use-bic", default=False, action="store_true") add_argparse_args = add_class_args diff --git a/hyperion/feats/__init__.py b/hyperion/feats/__init__.py index 81c3302c..9d77e032 100644 --- a/hyperion/feats/__init__.py +++ b/hyperion/feats/__init__.py @@ -11,4 +11,3 @@ from .energy_vad import EnergyVAD from .frame_selector import FrameSelector from .feature_normalization import MeanVarianceNorm - diff --git a/hyperion/feats/energy_vad.py b/hyperion/feats/energy_vad.py index 7fb37062..734e86bb 100644 --- a/hyperion/feats/energy_vad.py +++ b/hyperion/feats/energy_vad.py @@ -15,23 +15,31 @@ class EnergyVAD(object): """Compute VAD based on Kaldi Energy VAD method. - Attributes: - sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000) - frame_length: Frame length in milliseconds (default = 25) - frame_shift: Frame shift in milliseconds (default = 10) - dither: Dithering constant (0.0 means no dither) (default = 1) - snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (default = True) - vad_energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5) - vad_energy_threshold: Constant term in energy threshold for MFCC0 for VAD (also see --vad-energy-mean-scale) (float, default = 5) - vad_frames_context: Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0) - vad_proportion_threshold: Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6) + Attributes: + sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + dither: Dithering constant (0.0 means no dither) (default = 1) + snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. 
If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (default = True) + vad_energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5) + vad_energy_threshold: Constant term in energy threshold for MFCC0 for VAD (also see --vad-energy-mean-scale) (float, default = 5) + vad_frames_context: Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0) + vad_proportion_threshold: Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6) """ - def __init__(self, sample_frequency=16000, frame_length=25, frame_shift=10, - dither=1, snip_edges=True, - vad_energy_mean_scale=0.5, - vad_energy_threshold=5, vad_frames_context=0, - vad_proportion_threshold=0.6): - + + def __init__( + self, + sample_frequency=16000, + frame_length=25, + frame_shift=10, + dither=1, + snip_edges=True, + vad_energy_mean_scale=0.5, + vad_energy_threshold=5, + vad_frames_context=0, + vad_proportion_threshold=0.6, + ): + self.sample_frequency = sample_frequency fs = sample_frequency self.fs = fs @@ -40,183 +48,211 @@ def __init__(self, sample_frequency=16000, frame_length=25, frame_shift=10, self.dither = dither self.snip_edges = snip_edges - N = int(np.floor(frame_length * fs/1000)) + N = int(np.floor(frame_length * fs / 1000)) self._length = N - self._shift = int(np.floor(frame_shift * fs/1000)) + self._shift = int(np.floor(frame_shift * fs / 1000)) self._dc_a = np.array([1, -0.999], dtype=float_cpu()) self._dc_b = np.array([1, -1], dtype=float_cpu()) - assert vad_energy_mean_scale >=0 - assert vad_frames_context >=0 + assert vad_energy_mean_scale >= 0 + assert vad_frames_context >= 0 assert vad_proportion_threshold > 0 and vad_proportion_threshold < 1 - + self.vad_energy_mean_scale = vad_energy_mean_scale self.vad_energy_threshold = vad_energy_threshold self.vad_frames_context = vad_frames_context self.vad_proportion_threshold = vad_proportion_threshold - - self.reset() + self.reset() def reset(self): - """Resets the internal states of the filters """ + """Resets the internal states of the filters""" self._dc_zi = np.array([0], dtype=float_cpu()) - def compute(self, x, return_loge=False): - """ Evaluates the VAD. + """Evaluates the VAD. - Args: - x: Wave - return_loge: If true, it also returns the log-energy. + Args: + x: Wave + return_loge: If true, it also returns the log-energy. 
- Returns: - Binary VAD + Returns: + Binary VAD """ - if x.ndim==1: + if x.ndim == 1: # Input is wave if self.snip_edges: - num_frames = int(np.floor((len(x) - self._length + self._shift)/self._shift)) + num_frames = int( + np.floor((len(x) - self._length + self._shift) / self._shift) + ) else: - num_frames = int(np.round(len(x)/self._shift)) - len_x = (num_frames-1)*self._shift + self._length + num_frames = int(np.round(len(x) / self._shift)) + len_x = (num_frames - 1) * self._shift + self._length dlen_x = len_x - len(x) - dlen1_x = int(np.floor((self._length-self._shift)/2)) + dlen1_x = int(np.floor((self._length - self._shift) / 2)) dlen2_x = int(dlen_x - dlen1_x) - x = np.pad(x, (dlen1_x, dlen2_x), mode='reflect') + x = np.pad(x, (dlen1_x, dlen2_x), mode="reflect") # add dither if self.dither > 0: - n = self.dither*np.random.RandomState( - seed=len(x)).randn(len(x)).astype(float_cpu(), copy=False) + n = self.dither * np.random.RandomState(seed=len(x)).randn( + len(x) + ).astype(float_cpu(), copy=False) x = x + n - + x, self._dc_zi = lfilter(self._dc_b, self._dc_a, x, zi=self._dc_zi) # Compute raw energy logE = st_logE(x, self._length, self._shift) - elif x.ndim==2: + elif x.ndim == 2: # Assume that input are features with log-e in the first coeff of the vector logE = x[:, 0] else: - raise Exception('Wrong input dimension ndim=%d' % x.ndim) - + raise Exception("Wrong input dimension ndim=%d" % x.ndim) # compute VAD from logE - #print(np.mean(logE)) + # print(np.mean(logE)) e_thr = self.vad_energy_threshold + self.vad_energy_mean_scale * np.mean(logE) - #print(e_thr) - #print(logE) - vad = (logE > e_thr) + # print(e_thr) + # print(logE) + vad = logE > e_thr context = self.vad_frames_context if context == 0: return vad - + window = 2 * context + 1 if len(vad) < window: - context = int(len(vad)-1/2) + context = int(len(vad) - 1 / 2) window = 2 * context + 1 if window == 1: return vad - h = np.ones((window,), dtype='float32') - num_count = np.convolve(vad.astype('float32'), h, 'same') - den_count_boundary = np.arange(context+1, window, dtype='float32') + h = np.ones((window,), dtype="float32") + num_count = np.convolve(vad.astype("float32"), h, "same") + den_count_boundary = np.arange(context + 1, window, dtype="float32") num_count[:context] /= den_count_boundary num_count[-context:] /= den_count_boundary[::-1] num_count[context:-context] /= window - + vad = num_count > self.vad_proportion_threshold return vad - @staticmethod def filter_args(**kwargs): """Filters VAD args from arguments dictionary. - - Args: - kwargs: Arguments dictionary. - - Returns: - Dictionary with VAD options. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with VAD options. """ - valid_args = ('sample_frequency', 'frame_length', 'frame_shift', - 'dither', 'snip_edges', - 'vad_energy_mean_scale', 'vad_energy_threshold', - 'vad_frames_context', 'vad_proportion_threshold') - - d = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ( + "sample_frequency", + "frame_length", + "frame_shift", + "dither", + "snip_edges", + "vad_energy_mean_scale", + "vad_energy_threshold", + "vad_frames_context", + "vad_proportion_threshold", + ) + + d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return d - - + @staticmethod def add_class_args(parser, prefix=None): """Adds VAD options to parser. - - Args: - parser: Arguments parser - prefix: Options prefix. + + Args: + parser: Arguments parser + prefix: Options prefix. 
""" if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." parser.add_argument( - p1+'sample-frequency', - default=16000, type=int, - help=('Waveform data sample frequency ' - '(must match the waveform file, if specified there)')) - + p1 + "sample-frequency", + default=16000, + type=int, + help=( + "Waveform data sample frequency " + "(must match the waveform file, if specified there)" + ), + ) + parser.add_argument( - p1+'frame-length', type=int, + p1 + "frame-length", + type=int, default=25, - help='Frame length in milliseconds') + help="Frame length in milliseconds", + ) parser.add_argument( - p1+'frame-shift', type=int, - default=10, - help='Frame shift in milliseconds') + p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + ) parser.add_argument( - p1+'dither', type=float, + p1 + "dither", + type=float, default=1, - help='Dithering constant (0.0 means no dither)') - + help="Dithering constant (0.0 means no dither)", + ) + parser.add_argument( - p1+'snip-edges', - default=True, type=str2bool, - help=('If true, end effects will be handled by outputting only ' - 'frames that completely fit in the file, and the number of ' - 'frames depends on the frame-length. ' - 'If false, the number of frames depends only on the ' - 'frame-shift, and we reflect the data at the ends.')) + p1 + "snip-edges", + default=True, + type=str2bool, + help=( + "If true, end effects will be handled by outputting only " + "frames that completely fit in the file, and the number of " + "frames depends on the frame-length. " + "If false, the number of frames depends only on the " + "frame-shift, and we reflect the data at the ends." + ), + ) parser.add_argument( - p1+'vad-energy-mean-scale', type=float, + p1 + "vad-energy-mean-scale", + type=float, default=0.5, - help=('If this is set to s, to get the actual threshold we let m ' - 'be the mean log-energy of the file, and use ' - 's*m + vad-energy-threshold')) + help=( + "If this is set to s, to get the actual threshold we let m " + "be the mean log-energy of the file, and use " + "s*m + vad-energy-threshold" + ), + ) parser.add_argument( - p1+'vad-energy-threshold', type=float, + p1 + "vad-energy-threshold", + type=float, default=5, - help='Constant term in energy threshold for MFCC0 for VAD') + help="Constant term in energy threshold for MFCC0 for VAD", + ) parser.add_argument( - p1+'vad-frames-context', type=int, + p1 + "vad-frames-context", + type=int, default=0, - help=('Number of frames of context on each side of central frame, ' - 'in window for which energy is monitored')) + help=( + "Number of frames of context on each side of central frame, " + "in window for which energy is monitored" + ), + ) parser.add_argument( - p1+'vad-proportion-threshold', type=float, + p1 + "vad-proportion-threshold", + type=float, default=0.6, - help=('Parameter controlling the proportion of frames within ' - 'the window that need to have more energy than the threshold')) - - + help=( + "Parameter controlling the proportion of frames within " + "the window that need to have more energy than the threshold" + ), + ) + add_argparse_args = add_class_args diff --git a/hyperion/feats/feature_normalization.py b/hyperion/feats/feature_normalization.py index 1c33b651..313d027b 100644 --- a/hyperion/feats/feature_normalization.py +++ b/hyperion/feats/feature_normalization.py @@ -12,27 +12,27 @@ class MeanVarianceNorm(object): """Class to perform mean and variance normalization - + Attributes: norm_mean: normalize mean 
norm_var: normalize variance left_context: past context of the sliding window, if None all past frames. right_context: future context of the sliding window, if None all future frames. - + If left_context==right_context==None, it will apply global mean/variance normalization. """ - def __init__(self, norm_mean=True, norm_var=False, left_context=None, right_context=None): + + def __init__( + self, norm_mean=True, norm_var=False, left_context=None, right_context=None + ): self.norm_mean = norm_mean self.norm_var = norm_var self.left_context = left_context self.right_context = right_context - - def normalize(self, x): return self.normalize_cumsum(x) - def normalize_global(self, x): # Global mean/var norm. if self.norm_mean: @@ -41,11 +41,9 @@ def normalize_global(self, x): if self.norm_var: s_x = np.std(x, axis=0, keepdims=True) - x = x/s_x + x = x / s_x return x - - def normalize_conv(self, x): """Normalize featurex in x @@ -58,7 +56,7 @@ def normalize_conv(self, x): """ x = self.normalize_global(x) - + if self.right_context is None and self.left_context is None: return x @@ -74,25 +72,24 @@ def normalize_conv(self, x): total_context = left_context + right_context + 1 - if x.shape[0] <= min(right_context, left_context)+1: + if x.shape[0] <= min(right_context, left_context) + 1: # if context is larger than the signal we still return global normalization return x - v1 = np.ones((x.shape[0],1), dtype=float_cpu()) - h = np.ones((total_context,1), dtype=float_cpu()) - - counts = convolve2d(v1, h)[right_context:right_context+x.shape[0]] - m_x = convolve2d(x, h)[right_context:right_context+x.shape[0]] + v1 = np.ones((x.shape[0], 1), dtype=float_cpu()) + h = np.ones((total_context, 1), dtype=float_cpu()) + + counts = convolve2d(v1, h)[right_context : right_context + x.shape[0]] + m_x = convolve2d(x, h)[right_context : right_context + x.shape[0]] m_x /= counts if self.norm_var: - m2_x = convolve2d(x*x, h)[right_context:right_context+x.shape[0]] + m2_x = convolve2d(x * x, h)[right_context : right_context + x.shape[0]] m2_x /= counts - s2_x = m2_x - m_x**2 - s2_x[s2_x<1e-5] = 1e-5 + s2_x = m2_x - m_x ** 2 + s2_x[s2_x < 1e-5] = 1e-5 s_x = np.sqrt(s2_x) - if self.norm_mean: x -= m_x @@ -101,8 +98,6 @@ def normalize_conv(self, x): return x - - def normalize_cumsum(self, x): """Normalize featurex in x Uses cumsum @@ -130,40 +125,60 @@ def normalize_cumsum(self, x): total_context = left_context + right_context + 1 - if x.shape[0] <= min(right_context, left_context)+1: + if x.shape[0] <= min(right_context, left_context) + 1: # if context is larger than the signal we still return global normalization return x - c_x = np.zeros((x.shape[0]+total_context, x.shape[1],), dtype=float_cpu()) - counts = np.zeros((x.shape[0]+total_context, 1,), dtype=float_cpu()) - - c_x[left_context+1:left_context+x.shape[0]+1] = np.cumsum(x, axis=0) - c_x[left_context+x.shape[0]+1:] = c_x[left_context+x.shape[0]] - counts[left_context+1:left_context+x.shape[0]+1] = np.arange(1, x.shape[0]+1, dtype=float_cpu())[:,None] - counts[left_context+x.shape[0]+1:] = x.shape[0] + c_x = np.zeros( + ( + x.shape[0] + total_context, + x.shape[1], + ), + dtype=float_cpu(), + ) + counts = np.zeros( + ( + x.shape[0] + total_context, + 1, + ), + dtype=float_cpu(), + ) + + c_x[left_context + 1 : left_context + x.shape[0] + 1] = np.cumsum(x, axis=0) + c_x[left_context + x.shape[0] + 1 :] = c_x[left_context + x.shape[0]] + counts[left_context + 1 : left_context + x.shape[0] + 1] = np.arange( + 1, x.shape[0] + 1, dtype=float_cpu() + )[:, None] + 
counts[left_context + x.shape[0] + 1 :] = x.shape[0] if self.norm_var: - c2_x = np.zeros((x.shape[0]+total_context, x.shape[1],), dtype=float_cpu()) - c2_x[left_context+1:left_context+x.shape[0]+1] = np.cumsum(x*x, axis=0) - c2_x[left_context+x.shape[0]+1:] = c2_x[left_context+x.shape[0]] + c2_x = np.zeros( + ( + x.shape[0] + total_context, + x.shape[1], + ), + dtype=float_cpu(), + ) + c2_x[left_context + 1 : left_context + x.shape[0] + 1] = np.cumsum( + x * x, axis=0 + ) + c2_x[left_context + x.shape[0] + 1 :] = c2_x[left_context + x.shape[0]] counts = counts[total_context:] - counts[:-total_context] - m_x = (c_x[total_context:] - c_x[:-total_context])/counts + m_x = (c_x[total_context:] - c_x[:-total_context]) / counts if self.norm_mean: x -= m_x if self.norm_var: - m2_x = (c2_x[total_context:] - c2_x[:-total_context])/counts - s2_x=m2_x - m_x**2 - s2_x[s2_x<1e-5]=1e-5 + m2_x = (c2_x[total_context:] - c2_x[:-total_context]) / counts + s2_x = m2_x - m_x ** 2 + s2_x[s2_x < 1e-5] = 1e-5 s_x = np.sqrt(s2_x) x /= s_x return x - - def normalize_slow(self, x): x = self.normalize_global(x) @@ -180,97 +195,114 @@ def normalize_slow(self, x): right_context = x.shape[0] else: right_context = self.right_context - + m_x = np.zeros_like(x) s_x = np.zeros_like(x) for i in range(x.shape[0]): - idx1 = max(i-left_context, 0) - idx2 = min(i+right_context, x.shape[0]-1) + 1 + idx1 = max(i - left_context, 0) + idx2 = min(i + right_context, x.shape[0] - 1) + 1 denom = idx2 - idx1 m_x[i] = np.mean(x[idx1:idx2], axis=0) s_x[i] = np.std(x[idx1:idx2], axis=0) - if self.norm_mean: x -= m_x if self.norm_var: - s_x[s_x num_frames_vad: if num_frames - num_frames_vad <= self.tol_num_frames: - return x[:num_frames_vad,:][sel,:] + return x[:num_frames_vad, :][sel, :] else: - raise Exception('num_frames (%d) > num_frames_vad (%d) + tol (%d)' - % (num_frames, num_frames_vad, self.tol_num_frames)) + raise Exception( + "num_frames (%d) > num_frames_vad (%d) + tol (%d)" + % (num_frames, num_frames_vad, self.tol_num_frames) + ) else: if num_frames_vad - num_frames <= self.tol_num_frames: - return x[sel[:num_frames],:] + return x[sel[:num_frames], :] else: - raise Exception('num_frames_vad (%d) > num_frames (%d) + tol (%d)' - % (num_frames_vad, num_frames, self.tol_num_frames)) - + raise Exception( + "num_frames_vad (%d) > num_frames (%d) + tol (%d)" + % (num_frames_vad, num_frames, self.tol_num_frames) + ) - @staticmethod def filter_args(**kwargs): """Filters frame selector args from arguments dictionary. - - Args: - prefix: Options prefix. - kwargs: Arguments dictionary. - - Returns: - Dictionary with frame-selector options. + + Args: + prefix: Options prefix. + kwargs: Arguments dictionary. + + Returns: + Dictionary with frame-selector options. """ - valid_args = ('tol_num_frames') + valid_args = "tol_num_frames" - d = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return d - - @staticmethod def add_class_args(parser, prefix=None): """Adds frame-selector options to parser. - - Args: - parser: Arguments parser - prefix: Options prefix. + + Args: + parser: Arguments parser + prefix: Options prefix. """ if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." 
- parser.add_argument(p1+'tol-num-frames', type=int, - default=3, - help='maximum tolerated error between number of feature frames and VAD frames.') + parser.add_argument( + p1 + "tol-num-frames", + type=int, + default=3, + help="maximum tolerated error between number of feature frames and VAD frames.", + ) - add_argparse_args = add_class_args diff --git a/hyperion/feats/mfcc.py b/hyperion/feats/mfcc.py index f9b3733b..94af5c2e 100644 --- a/hyperion/feats/mfcc.py +++ b/hyperion/feats/mfcc.py @@ -18,6 +18,7 @@ class MFCCSteps(Enum): """Steps in the MFCC pipeline""" + WAVE = 0 FFT = 1 SPEC = 2 @@ -62,53 +63,56 @@ def __ne__(self, other): class MFCC(object): """Compute MFCC features. - Attributes: - sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000) - frame_length: Frame length in milliseconds (default = 25) - frame_shift: Frame shift in milliseconds (default = 10) - fft_length: Length of FFT (default = 512) - remove_dc_offset: Subtract mean from waveform on each frame (default = True) - preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) - window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"blackmann") (default = 'povey') - use_fft2: If true, it uses |X(f)|^2, if false, it uses |X(f)|, (default = True) - dither: Dithering constant (0.0 means no dither) (default = 1) - fb_type: Filter-bank type: mel_kaldi, mel_etsi, mel_librosa, mel_librosa_htk, linear (default = 'mel_kaldi') - low_freq: Low cutoff frequency for mel bins (default = 20) - high_freq: High cutoff frequency for mel bins (if < 0, offset from Nyquist) (default = 0) - num_filters: Number of triangular mel-frequency bins (default = 23) - norm_filters: Normalize filters coeff to sum up to 1, if librosa it uses stanley norm (default = False) - num_ceps: Number of cepstra in MFCC computation (including C0) (default = 13) - snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. 
(default = True) - energy_floor: Floor on energy (absolute, not relative) in MFCC computation (default = 0) - raw_energy: If true, compute energy before preemphasis and windowing (default = True) - use_energy: Use energy (not C0) in MFCC computation (default = True) - cepstral_lifter: Constant that controls scaling of MFCCs (default = 22) - input_step: It can continue computation from any step: wav, fft, spec, logfb (default = 'wav') - output_step: It can return intermediate result: fft, spec, logfb, mfcc (default = 'mfcc') + Attributes: + sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (default = 16000) + frame_length: Frame length in milliseconds (default = 25) + frame_shift: Frame shift in milliseconds (default = 10) + fft_length: Length of FFT (default = 512) + remove_dc_offset: Subtract mean from waveform on each frame (default = True) + preemphasis_coeff: Coefficient for use in signal preemphasis (default = 0.97) + window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"blackmann") (default = 'povey') + use_fft2: If true, it uses |X(f)|^2, if false, it uses |X(f)|, (default = True) + dither: Dithering constant (0.0 means no dither) (default = 1) + fb_type: Filter-bank type: mel_kaldi, mel_etsi, mel_librosa, mel_librosa_htk, linear (default = 'mel_kaldi') + low_freq: Low cutoff frequency for mel bins (default = 20) + high_freq: High cutoff frequency for mel bins (if < 0, offset from Nyquist) (default = 0) + num_filters: Number of triangular mel-frequency bins (default = 23) + norm_filters: Normalize filters coeff to sum up to 1, if librosa it uses stanley norm (default = False) + num_ceps: Number of cepstra in MFCC computation (including C0) (default = 13) + snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. 
(default = True) + energy_floor: Floor on energy (absolute, not relative) in MFCC computation (default = 0) + raw_energy: If true, compute energy before preemphasis and windowing (default = True) + use_energy: Use energy (not C0) in MFCC computation (default = True) + cepstral_lifter: Constant that controls scaling of MFCCs (default = 22) + input_step: It can continue computation from any step: wav, fft, spec, logfb (default = 'wav') + output_step: It can return intermediate result: fft, spec, logfb, mfcc (default = 'mfcc') """ - def __init__(self, - sample_frequency=16000, - frame_length=25, - frame_shift=10, - fft_length=512, - remove_dc_offset=True, - preemphasis_coeff=0.97, - window_type='povey', - use_fft2=True, - dither=1, - fb_type='mel_kaldi', - low_freq=20, - high_freq=0, - num_filters=23, - norm_filters=False, - num_ceps=13, - snip_edges=True, - energy_floor=0, - raw_energy=True, - use_energy=True, - cepstral_lifter=22, - input_step='wave', - output_step='mfcc'): + + def __init__( + self, + sample_frequency=16000, + frame_length=25, + frame_shift=10, + fft_length=512, + remove_dc_offset=True, + preemphasis_coeff=0.97, + window_type="povey", + use_fft2=True, + dither=1, + fb_type="mel_kaldi", + low_freq=20, + high_freq=0, + num_filters=23, + norm_filters=False, + num_ceps=13, + snip_edges=True, + energy_floor=0, + raw_energy=True, + use_energy=True, + cepstral_lifter=22, + input_step="wave", + output_step="mfcc", + ): self.fs = sample_frequency self.sample_frequency = sample_frequency @@ -119,7 +123,7 @@ def __init__(self, self.remove_dc_offset = remove_dc_offset self.preemphasis_coeff = preemphasis_coeff self.window_type = window_type - #self.blackman_coeff = blackman_coeff + # self.blackman_coeff = blackman_coeff self.use_fft2 = use_fft2 self.dither = dither self.fb_type = fb_type @@ -136,10 +140,8 @@ def __init__(self, self.input_step = input_step self.output_step = output_step - assert input_step in mfcc_steps_dict, 'Invalid input step %s' % ( - input_step) - assert output_step in mfcc_steps_dict, 'Invalid output step %s' % ( - output_step) + assert input_step in mfcc_steps_dict, "Invalid input step %s" % (input_step) + assert output_step in mfcc_steps_dict, "Invalid output step %s" % (output_step) self._input_step = mfcc_steps_dict[input_step] self._output_step = mfcc_steps_dict[output_step] @@ -147,7 +149,7 @@ def __init__(self, N = int(np.floor(frame_length * fs / 1000)) if N > fft_length: k = np.ceil(np.log(N) / np.log(2)) - self.fft_length = int(2**k) + self.fft_length = int(2 ** k) self._length = N self._shift = int(np.floor(frame_shift * fs / 1000)) @@ -155,32 +157,32 @@ def __init__(self, self._dc_a = np.array([1, -0.999], dtype=float_cpu()) self._dc_b = np.array([1, -1], dtype=float_cpu()) - self._preemph_b = np.array([1, -self.preemphasis_coeff], - dtype=float_cpu()) + self._preemph_b = np.array([1, -self.preemphasis_coeff], dtype=float_cpu()) self._window = FWF.create(window_type, N) # corrects scipy.stft scales fft by 1/sum(window) # self._fft_scale = np.sum(self._window) - self._fb = FBF.create(fb_type, num_filters, self.fft_length, fs, - low_freq, high_freq, norm_filters) + self._fb = FBF.create( + fb_type, num_filters, self.fft_length, fs, low_freq, high_freq, norm_filters + ) self._lifter = MFCC.make_lifter(self.num_ceps, self.cepstral_lifter) self.reset() def reset(self): - """Resets the internal states of the filters """ + """Resets the internal states of the filters""" self._dc_zi = np.array([0], dtype=float_cpu()) self._preemph_zi = np.array([0], 
dtype=float_cpu()) @staticmethod def make_lifter(N, Q): """Makes the liftering function - - Args: - N: Number of cepstral coefficients. - Q: Liftering parameter - Returns: - Liftering vector. + Args: + N: Number of cepstral coefficients. + Q: Liftering parameter + + Returns: + Liftering vector. """ if Q == 0: return 1 @@ -188,35 +190,35 @@ def make_lifter(N, Q): def compute_raw_logE(self, x): """Computes log-energy before preemphasis filter - - Args: - x: wave signal - Returns: - Log-energy + Args: + x: wave signal + + Returns: + Log-energy """ return st_logE(x, self._length, self._shift) - def compute(self, - x, - return_fft=False, - return_spec=False, - return_logfb=False): - """ Evaluates the MFCC pipeline. - - Args: - x: Wave, stft, spectrogram or log-filter-bank depending on input_step. - return_fft: If true, it also returns short-time fft. - return_spec: If true, it also returns short-time magnitude spectrogram. - return_logfb: If true, it also returns log-filter-bank. - - Returns: - Stfft, spectrogram, log-filter-bank or MFCC depending on output_step. + def compute(self, x, return_fft=False, return_spec=False, return_logfb=False): + """Evaluates the MFCC pipeline. + + Args: + x: Wave, stft, spectrogram or log-filter-bank depending on input_step. + return_fft: If true, it also returns short-time fft. + return_spec: If true, it also returns short-time magnitude spectrogram. + return_logfb: If true, it also returns log-filter-bank. + + Returns: + Stfft, spectrogram, log-filter-bank or MFCC depending on output_step. """ assert not (return_fft and self._input_step > MFCCSteps.FFT) - assert not (return_spec and (self._input_step > MFCCSteps.SPEC - or self._output_step < MFCCSteps.SPEC)) + assert not ( + return_spec + and ( + self._input_step > MFCCSteps.SPEC or self._output_step < MFCCSteps.SPEC + ) + ) assert not (return_logfb and self._output_step < MFCCSteps.LOGFB) # Prepare input @@ -241,29 +243,27 @@ def compute(self, if self._input_step == MFCCSteps.WAVE: if self.snip_edges: num_frames = int( - np.floor( - (len(x) - self._length + self._shift) / self._shift)) + np.floor((len(x) - self._length + self._shift) / self._shift) + ) else: num_frames = int(np.round(len(x) / self._shift)) len_x = (num_frames - 1) * self._shift + self._length dlen_x = len_x - len(x) - #x = np.pad(x, (0, dlen_x), mode='reflect') + # x = np.pad(x, (0, dlen_x), mode='reflect') dlen1_x = int(np.floor((self._length - self._shift) / 2)) dlen2_x = int(dlen_x - dlen1_x) - x = np.pad(x, (dlen1_x, dlen2_x), mode='reflect') + x = np.pad(x, (dlen1_x, dlen2_x), mode="reflect") # add dither if self.dither > 0: n = self.dither * np.random.RandomState(seed=len(x)).randn( - len(x)).astype(float_cpu(), copy=False) + len(x) + ).astype(float_cpu(), copy=False) x = x + n # Remove offset if self.remove_dc_offset: - x, self._dc_zi = lfilter(self._dc_b, - self._dc_a, - x, - zi=self._dc_zi) + x, self._dc_zi = lfilter(self._dc_b, self._dc_a, x, zi=self._dc_zi) # Compute raw energy if self.use_energy and self.raw_energy: @@ -271,14 +271,14 @@ def compute(self, # Apply preemphasis filter if self.preemphasis_coeff > 0: - x, self._preemph_zi = lfilter(self._preemph_b, [1], - x, - zi=self._preemph_zi) + x, self._preemph_zi = lfilter( + self._preemph_b, [1], x, zi=self._preemph_zi + ) - #Comptue STFFT - #_, _, X = stft(x, window=self._window, nperseg=self._nperseg, noverlap=self._overlap, nfft=self.fft_length, boundary=None) + # Comptue STFFT + # _, _, X = stft(x, window=self._window, nperseg=self._nperseg, noverlap=self._overlap, 
nfft=self.fft_length, boundary=None) # Fix scale of FFT - #X = self._fft_scale * X[:, :num_frames].T + # X = self._fft_scale * X[:, :num_frames].T # xx = [] # j = 0 # for i in range(len(x)//160-2): @@ -289,8 +289,7 @@ def compute(self, # return np.vstack(tuple(xx)) - X = strft(x, self._length, self._shift, self.fft_length, - self._window) + X = strft(x, self._length, self._shift, self.fft_length, self._window) # Compute |X(f)| F = np.abs(X).astype(dtype=float_cpu(), copy=False) @@ -298,26 +297,29 @@ def compute(self, # Compute no-raw energy if self.use_energy and not self.raw_energy: # Use Paserval's theorem - logE = np.log(np.mean(F**2, axis=-1) + 1e-10) + logE = np.log(np.mean(F ** 2, axis=-1) + 1e-10) # Compute |X(f)|^2 if self._input_step <= MFCCSteps.FFT and self._output_step >= MFCCSteps.SPEC: if self.use_fft2: - F = F**2 + F = F ** 2 # Compute log-filter-bank - if self._input_step <= MFCCSteps.LOG_SPEC and self._output_step >= MFCCSteps.LOGFB: + if ( + self._input_step <= MFCCSteps.LOG_SPEC + and self._output_step >= MFCCSteps.LOGFB + ): B = np.log(np.dot(F, self._fb) + 1e-10) - #B = np.maximum(B, np.log(self.energy_floor+1e-15)) + # B = np.maximum(B, np.log(self.energy_floor+1e-15)) # Compute MFCC if self._input_step <= MFCCSteps.LOGFB and self._output_step == MFCCSteps.MFCC: - P = dct(B, type=2, norm='ortho')[:, :self.num_ceps] + P = dct(B, type=2, norm="ortho")[:, : self.num_ceps] if self.cepstral_lifter > 0: P *= self._lifter - #Select the right output type + # Select the right output type if self._output_step == MFCCSteps.FFT: R = X elif self._output_step == MFCCSteps.SPEC: @@ -330,7 +332,7 @@ def compute(self, R = P if self.use_energy: - #append energy + # append energy logE = np.maximum(logE, np.log(self.energy_floor + 1e-15)) if self._output_step == MFCCSteps.LOGFB: R = np.hstack((logE[:, None], R)) @@ -354,20 +356,38 @@ def compute(self, @staticmethod def filter_args(**kwargs): """Filters MFCC args from arguments dictionary. - - Args: - kwargs: Arguments dictionary. - - Returns: - Dictionary with MFCC options. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with MFCC options. """ - valid_args = ('sample_frequency', 'frame_length', 'frame_shift', - 'fft_length', 'remove_dc_offset', 'preemphasis_coeff', - 'window_type', 'blackman_coeff', 'use_fft2', 'dither', - 'fb_type', 'low_freq', 'high_freq', 'num_filters', - 'norm_filters', 'num_ceps', 'snip_edges', 'energy_floor', - 'raw_energy', 'use_energy', 'cepstral_lifter', - 'input_step', 'output_step') + valid_args = ( + "sample_frequency", + "frame_length", + "frame_shift", + "fft_length", + "remove_dc_offset", + "preemphasis_coeff", + "window_type", + "blackman_coeff", + "use_fft2", + "dither", + "fb_type", + "low_freq", + "high_freq", + "num_filters", + "norm_filters", + "num_ceps", + "snip_edges", + "energy_floor", + "raw_energy", + "use_energy", + "cepstral_lifter", + "input_step", + "output_step", + ) d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return d @@ -375,112 +395,133 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): """Adds MFCC options to parser. - - Args: - parser: Arguments parser - prefix: Options prefix. + + Args: + parser: Arguments parser + prefix: Options prefix. """ if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." 
parser.add_argument( - p1 + 'sample-frequency', + p1 + "sample-frequency", default=16000, type=int, - help='Waveform data sample frequency ' - '(must match the waveform file, if specified there)') - - parser.add_argument(p1 + 'frame-length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument(p1 + 'frame-shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument(p1 + 'fft-length', - type=int, - default=512, - help='Length of FFT') - - parser.add_argument(p1 + 'remove-dc-offset', - default=True, - type=str2bool, - help='Subtract mean from waveform on each frame') - - parser.add_argument(p1 + 'preemphasis-coeff', - type=float, - default=0.97, - help='Coefficient for use in signal preemphasis') + help="Waveform data sample frequency " + "(must match the waveform file, if specified there)", + ) + + parser.add_argument( + p1 + "frame-length", + type=int, + default=25, + help="Frame length in milliseconds", + ) + parser.add_argument( + p1 + "frame-shift", type=int, default=10, help="Frame shift in milliseconds" + ) + parser.add_argument( + p1 + "fft-length", type=int, default=512, help="Length of FFT" + ) + + parser.add_argument( + p1 + "remove-dc-offset", + default=True, + type=str2bool, + help="Subtract mean from waveform on each frame", + ) + + parser.add_argument( + p1 + "preemphasis-coeff", + type=float, + default=0.97, + help="Coefficient for use in signal preemphasis", + ) FWF.add_class_args(parser, prefix) parser.add_argument( - p1 + 'use-fft2', + p1 + "use-fft2", default=True, type=str2bool, - help='If true, it uses |X(f)|^2, if false, it uses |X(f)|') + help="If true, it uses |X(f)|^2, if false, it uses |X(f)|", + ) - parser.add_argument(p1 + 'dither', - type=float, - default=1, - help='Dithering constant (0.0 means no dither)') + parser.add_argument( + p1 + "dither", + type=float, + default=1, + help="Dithering constant (0.0 means no dither)", + ) FBF.add_class_args(parser, prefix) parser.add_argument( - p1 + 'num-ceps', + p1 + "num-ceps", type=int, default=13, - help='Number of cepstra in MFCC computation (including C0)') + help="Number of cepstra in MFCC computation (including C0)", + ) parser.add_argument( - p1 + 'snip-edges', + p1 + "snip-edges", default=True, type=str2bool, - help=('If true, end effects will be handled by outputting ' - 'only frames that completely fit in the file, and the ' - 'number of frames depends on the frame-length. ' - 'If false, the number of frames depends only on the ' - 'frame-shift, and we reflect the data at the ends.')) + help=( + "If true, end effects will be handled by outputting " + "only frames that completely fit in the file, and the " + "number of frames depends on the frame-length. " + "If false, the number of frames depends only on the " + "frame-shift, and we reflect the data at the ends." 
+ ), + ) parser.add_argument( - p1 + 'energy-floor', + p1 + "energy-floor", type=float, default=0, - help='Floor on energy (absolute, not relative) in MFCC computation' + help="Floor on energy (absolute, not relative) in MFCC computation", ) parser.add_argument( - p1 + 'raw-energy', + p1 + "raw-energy", + default=True, + type=str2bool, + help="If true, compute energy before preemphasis and windowing", + ) + parser.add_argument( + p1 + "use-energy", default=True, type=str2bool, - help='If true, compute energy before preemphasis and windowing') - parser.add_argument(p1 + 'use-energy', - default=True, - type=str2bool, - help='Use energy (not C0) in MFCC computation') + help="Use energy (not C0) in MFCC computation", + ) - parser.add_argument(p1 + 'cepstral-lifter', - type=float, - default=22, - help='Constant that controls scaling of MFCCs') + parser.add_argument( + p1 + "cepstral-lifter", + type=float, + default=22, + help="Constant that controls scaling of MFCCs", + ) parser.add_argument( - p1 + 'input-step', - default='wave', - choices=['wave', 'fft', 'spec', 'log_spec', 'logfb'], - help=('It can continue computation from any step: ' - 'wav, fft, spec, logfb')) + p1 + "input-step", + default="wave", + choices=["wave", "fft", "spec", "log_spec", "logfb"], + help=( + "It can continue computation from any step: " "wav, fft, spec, logfb" + ), + ) parser.add_argument( - p1 + 'output-step', - default='mfcc', - choices=['fft', 'spec', 'log_spec', 'logfb', 'mfcc'], - help=('It can return intermediate result: ' - 'fft, spec, log_spec, logfb, mfcc')) + p1 + "output-step", + default="mfcc", + choices=["fft", "spec", "log_spec", "logfb", "mfcc"], + help=( + "It can return intermediate result: " "fft, spec, log_spec, logfb, mfcc" + ), + ) add_argparse_args = add_class_args diff --git a/hyperion/feats/stft.py b/hyperion/feats/stft.py index 8e9163bf..7f22bdee 100644 --- a/hyperion/feats/stft.py +++ b/hyperion/feats/stft.py @@ -9,18 +9,19 @@ from ..hyp_defs import float_cpu + def stft(x, frame_length, frame_shift, fft_length, window=None): if window is None: window = 1 - - num_frames = int(np.floor((len(x) - frame_length + frame_shift)/frame_shift)) - X = np.zeros((num_frames, fft_length), dtype='complex64') + + num_frames = int(np.floor((len(x) - frame_length + frame_shift) / frame_shift)) + X = np.zeros((num_frames, fft_length), dtype="complex64") j = 0 for i in range(num_frames): - X[i,:] = np.fft.fft(x[j:j+frame_length]*window, n=fft_length) + X[i, :] = np.fft.fft(x[j : j + frame_length] * window, n=fft_length) j += frame_shift - + return X @@ -29,40 +30,37 @@ def istft(X, frame_length, frame_shift, window=None): if window is None: window = np.ones((frame_length,), dtype=float_cpu()) - num_samples = (X.shape[0] - 1)*frame_shift + frame_length - x_overlap = np.zeros((num_samples,), dtype='complex64') + num_samples = (X.shape[0] - 1) * frame_shift + frame_length + x_overlap = np.zeros((num_samples,), dtype="complex64") w_overlap = np.zeros((num_samples,), dtype=float_cpu()) - xx = np.fft.ifft(X, axis=-1)[:,:frame_length] + xx = np.fft.ifft(X, axis=-1)[:, :frame_length] j = 0 for i in range(X.shape[0]): - x_overlap[j:j+frame_length] += xx[i] - w_overlap[j:j+frame_length] += window + x_overlap[j : j + frame_length] += xx[i] + w_overlap[j : j + frame_length] += window j += frame_shift - w_overlap[w_overlap==0] = 1 - iw = 1/w_overlap + w_overlap[w_overlap == 0] = 1 + iw = 1 / w_overlap # iw[w_overlap==0] = 0 x = x_overlap * iw return x - - def strft(x, frame_length, frame_shift, fft_length, window=None): if 
window is None: window = 1 - - num_frames = int(np.floor((len(x) - frame_length + frame_shift)/frame_shift)) - X = np.zeros((num_frames, int(fft_length/2+1)), dtype='complex64') + + num_frames = int(np.floor((len(x) - frame_length + frame_shift) / frame_shift)) + X = np.zeros((num_frames, int(fft_length / 2 + 1)), dtype="complex64") j = 0 for i in range(num_frames): - X[i,:] = np.fft.rfft(x[j:j+frame_length]*window, n=fft_length) + X[i, :] = np.fft.rfft(x[j : j + frame_length] * window, n=fft_length) j += frame_shift - - return X + return X def istrft(X, frame_length, frame_shift, window=None): @@ -70,42 +68,41 @@ def istrft(X, frame_length, frame_shift, window=None): if window is None: window = np.ones((frame_length,), dtype=float_cpu()) - num_samples = (X.shape[0] - 1)*frame_shift + frame_length + num_samples = (X.shape[0] - 1) * frame_shift + frame_length x_overlap = np.zeros((num_samples,), dtype=float_cpu()) w_overlap = np.zeros((num_samples,), dtype=float_cpu()) - xx = np.fft.irfft(X, axis=-1)[:,:frame_length] + xx = np.fft.irfft(X, axis=-1)[:, :frame_length] j = 0 for i in range(X.shape[0]): - x_overlap[j:j+frame_length] += xx[i] - w_overlap[j:j+frame_length] += window + x_overlap[j : j + frame_length] += xx[i] + w_overlap[j : j + frame_length] += window j += frame_shift - w_overlap[w_overlap==0] = 1 - iw = 1/w_overlap + w_overlap[w_overlap == 0] = 1 + iw = 1 / w_overlap # iw[w_overlap==0] = 0 x = x_overlap * iw return x - def st_logE(x, frame_length, frame_shift): """Computes log-energy before preemphasis filter - Args: - x: wave signal + Args: + x: wave signal - Returns: - Log-energy - """ - - num_frames = int(np.floor((len(x) - frame_length + frame_shift)/frame_shift)) - - x2 = x**2 + Returns: + Log-energy + """ + + num_frames = int(np.floor((len(x) - frame_length + frame_shift) / frame_shift)) + + x2 = x ** 2 e = np.zeros((num_frames,), dtype=float_cpu()) j = 0 for i in range(num_frames): - e[i] = np.sum(x2[j:j+frame_length]) + e[i] = np.sum(x2[j : j + frame_length]) j += frame_shift - - return np.log(e+1e-15) + + return np.log(e + 1e-15) diff --git a/hyperion/helpers/__init__.py b/hyperion/helpers/__init__.py index e832670c..eeaf2cce 100644 --- a/hyperion/helpers/__init__.py +++ b/hyperion/helpers/__init__.py @@ -11,8 +11,8 @@ from .multi_test_trial_data_reader_v2 import MultiTestTrialDataReaderV2 from .classif_trial_data_reader import ClassifTrialDataReader -#from .sequence_reader import SequenceReader -#from .sequence_class_reader import SequenceClassReader -#from .sequence_post_reader import SequencePostReader -#from .sequence_post_class_reader import SequencePostClassReader +# from .sequence_reader import SequenceReader +# from .sequence_class_reader import SequenceClassReader +# from .sequence_post_reader import SequencePostReader +# from .sequence_post_class_reader import SequencePostClassReader from .plda_factory import PLDAFactory diff --git a/hyperion/helpers/classif_trial_data_reader.py b/hyperion/helpers/classif_trial_data_reader.py index 1779163b..f5d74640 100644 --- a/hyperion/helpers/classif_trial_data_reader.py +++ b/hyperion/helpers/classif_trial_data_reader.py @@ -16,21 +16,31 @@ from ..utils import TrialNdx, SCPList from ..transforms import TransformList + class ClassifTrialDataReader(object): """ Loads data to eval classification problems (deprecated) """ - def __init__(self, v_file, class2int_file, test_file, - preproc, v_field='', seg_idx=1, num_seg_parts=1): + + def __init__( + self, + v_file, + class2int_file, + test_file, + preproc, + v_field="", + 
seg_idx=1, + num_seg_parts=1, + ): self.r = HypDataReader(v_file) self.preproc = preproc self.field = v_field - with open(class2int_file, 'r') as f: + with open(class2int_file, "r") as f: model_set = [line.rstrip().split()[0] for line in f] - with open(test_file, 'r') as f: + with open(test_file, "r") as f: seg_set = [line.rstrip().split()[0] for line in f] ndx = TrialNdx(model_set, seg_set) @@ -40,8 +50,6 @@ def __init__(self, v_file, class2int_file, test_file, self.ndx = ndx - - def read(self): x_t = self.r.read(self.ndx.seg_set, self.field, return_tensor=True) if self.preproc is not None: @@ -49,33 +57,39 @@ def read(self): return x_t, self.ndx - @staticmethod def filter_args(**kwargs): - valid_args = ('v_field', 'seg_idx', 'num_seg_parts') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("v_field", "seg_idx", "num_seg_parts") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' - p2 = '' + p1 = "--" + p2 = "" else: - p1 = '--' + prefix + '.' - p2 = prefix + '.' + p1 = "--" + prefix + "." + p2 = prefix + "." parser.add_argument( - p1+'v-field', default='', - help=('dataset field in the data file')) + p1 + "v-field", default="", help=("dataset field in the data file") + ) parser.add_argument( - p1+'seg-part-idx', dest=(p2+'seg_idx'), default=1, type=int, - help=('test part index')) + p1 + "seg-part-idx", + dest=(p2 + "seg_idx"), + default=1, + type=int, + help=("test part index"), + ) parser.add_argument( - p1+'num-seg-parts', default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - + p1 + "num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + add_argparse_args = add_class_args diff --git a/hyperion/helpers/multi_test_trial_data_reader.py b/hyperion/helpers/multi_test_trial_data_reader.py index 0d650696..57355cd0 100644 --- a/hyperion/helpers/multi_test_trial_data_reader.py +++ b/hyperion/helpers/multi_test_trial_data_reader.py @@ -15,15 +15,27 @@ from ..utils import TrialNdx, TrialKey, Utt2Info from ..transforms import TransformList + class MultiTestTrialDataReader(object): """ Loads Ndx, enroll file and x-vectors to evaluate PLDA. 
""" - def __init__(self, v_file, ndx_file, enroll_file, test_file, test_subseg2orig_file, - preproc, tlist_sep=' ', - model_idx=1, num_model_parts=1, seg_idx=1, num_seg_parts=1, - eval_set='enroll-test'): + def __init__( + self, + v_file, + ndx_file, + enroll_file, + test_file, + test_subseg2orig_file, + preproc, + tlist_sep=" ", + model_idx=1, + num_model_parts=1, + seg_idx=1, + num_seg_parts=1, + eval_set="enroll-test", + ): self.r = DRF.create(v_file) self.preproc = preproc @@ -40,76 +52,92 @@ def __init__(self, v_file, ndx_file, enroll_file, test_file, test_subseg2orig_fi ndx = TrialKey.load(ndx_file).to_ndx() subseg2orig = Utt2Info.load(test_subseg2orig_file, sep=tlist_sep) - + ndx, enroll = TrialNdx.parse_eval_set(ndx, enroll, test, eval_set) if num_model_parts > 1 or num_seg_parts > 1: ndx = TrialNdx.split(model_idx, num_model_parts, seg_idx, num_seg_parts) enroll = enroll.filter_info(ndx.model_set) - subseg2orig =subseg2orig.filter_info(ndx.seg_set) + subseg2orig = subseg2orig.filter_info(ndx.seg_set) self.enroll = enroll self.ndx = ndx self.subseg2orig = subseg2orig - def read(self): x_e = self.r.read(self.enroll.key, squeeze=True) x_t = self.r.read(self.subseg2orig.key, squeeze=True) - + if self.preproc is not None: x_e = self.preproc.predict(x_e) x_t = self.preproc.predict(x_t) return x_e, x_t, self.enroll.info, self.ndx, self.subseg2orig.info - - @staticmethod def filter_args(**kwargs): - valid_args = ('tlist_sep', - 'model_idx','num_model_parts', - 'seg_idx', 'num_seg_parts', - 'eval_set') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - - + valid_args = ( + "tlist_sep", + "model_idx", + "num_model_parts", + "seg_idx", + "num_seg_parts", + "eval_set", + ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' - p2 = '' + p1 = "--" + p2 = "" else: - p1 = '--' + prefix + '.' - p2 = prefix + '.' + p1 = "--" + prefix + "." + p2 = prefix + "." 
parser.add_argument( - p1+'tlist-sep', default=' ', - help=('trial lists field separator')) + p1 + "tlist-sep", default=" ", help=("trial lists field separator") + ) # parser.add_argument(p1+'v-field', dest=(p2+'v_field'), default='', # help=('dataset field in the data file')) parser.add_argument( - p1+'model-part-idx', - dest=(p2+'model_idx'), default=1, type=int, - help=('model part index')) - parser.add_argument( - p1+'num-model-parts', default=1, type=int, - help=('number of parts in which we divide the model' - 'list to run evaluation in parallel')) + p1 + "model-part-idx", + dest=(p2 + "model_idx"), + default=1, + type=int, + help=("model part index"), + ) parser.add_argument( - p1+'seg-part-idx', dest=(p2+'seg_idx'), default=1, type=int, - help=('test part index')) + p1 + "num-model-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the model" + "list to run evaluation in parallel" + ), + ) parser.add_argument( - p1+'num-seg-parts', default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - + p1 + "seg-part-idx", + dest=(p2 + "seg_idx"), + default=1, + type=int, + help=("test part index"), + ) parser.add_argument( - p1+'eval-set', type=str.lower, - default='enroll-test', - choices=['enroll-test','enroll-coh','coh-test','coh-coh'], - help=('evaluation subset')) + p1 + "num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + parser.add_argument( + p1 + "eval-set", + type=str.lower, + default="enroll-test", + choices=["enroll-test", "enroll-coh", "coh-test", "coh-coh"], + help=("evaluation subset"), + ) add_argparse_args = add_class_args - diff --git a/hyperion/helpers/multi_test_trial_data_reader_v2.py b/hyperion/helpers/multi_test_trial_data_reader_v2.py index de7d71f1..306f75ae 100644 --- a/hyperion/helpers/multi_test_trial_data_reader_v2.py +++ b/hyperion/helpers/multi_test_trial_data_reader_v2.py @@ -15,15 +15,27 @@ from ..utils import Utt2Info, TrialNdx, TrialKey from ..transforms import TransformList + class MultiTestTrialDataReaderV2(object): """ Loads Ndx, enroll file and x-vectors to evaluate PLDA. 
""" - def __init__(self, enroll_v_file, test_v_file, ndx_file, enroll_file, test_file, - preproc=None, tlist_sep=' ', - model_idx=1, num_model_parts=1, seg_idx=1, num_seg_parts=1, - eval_set='enroll-test'): + def __init__( + self, + enroll_v_file, + test_v_file, + ndx_file, + enroll_file, + test_file, + preproc=None, + tlist_sep=" ", + model_idx=1, + num_model_parts=1, + seg_idx=1, + num_seg_parts=1, + eval_set="enroll-test", + ): self.r_e = DRF.create(enroll_v_file) self.r_t = DRF.create(test_v_file) @@ -48,63 +60,91 @@ def __init__(self, enroll_v_file, test_v_file, ndx_file, enroll_file, test_file, self.enroll = enroll self.ndx = ndx - def read(self): x_e = self.r_e.read(self.enroll.key, squeeze=True) x_t = self.r_t.read(self.ndx.seg_set, squeeze=False) orig_seg = [] - for i,x_ti in enumerate(x_t): - orig_seg.append(np.asarray([i]*x_ti.shape[0], dtype=np.int)) + for i, x_ti in enumerate(x_t): + orig_seg.append(np.asarray([i] * x_ti.shape[0], dtype=np.int)) x_t = np.concatenate(tuple(x_t), axis=0) orig_seg = np.concatenate(tuple(orig_seg), axis=0) - + if self.preproc is not None: x_e = self.preproc.predict(x_e) x_t = self.preproc.predict(x_t) return x_e, x_t, self.enroll.info, self.ndx, orig_seg - - @staticmethod def filter_args(prefix=None, **kwargs): if prefix is None: - p = '' + p = "" else: - p = prefix + '_' - valid_args = ('tlist_sep', - 'model_idx','num_model_parts', - 'seg_idx', 'num_seg_parts', - 'eval_set') - return dict((k, kwargs[p+k]) - for k in valid_args if p+k in kwargs) - - + p = prefix + "_" + valid_args = ( + "tlist_sep", + "model_idx", + "num_model_parts", + "seg_idx", + "num_seg_parts", + "eval_set", + ) + return dict((k, kwargs[p + k]) for k in valid_args if p + k in kwargs) + @staticmethod def add_argparse_args(parser, prefix=None): if prefix is None: - p1 = '--' - p2 = '' + p1 = "--" + p2 = "" else: - p1 = '--' + prefix + '-' - p2 = prefix + '_' - parser.add_argument(p1+'tlist-sep', dest=(p2+'tlist_sep'), default=' ', - help=('trial lists field separator')) - - parser.add_argument(p1+'model-part-idx', dest=(p2+'model_idx'), default=1, type=int, - help=('model part index')) - parser.add_argument(p1+'num-model-parts', default=1, type=int, - help=('number of parts in which we divide the model' - 'list to run evaluation in parallel')) - parser.add_argument(p1+'seg-part-idx', dest=(p2+'seg_idx'), default=1, type=int, - help=('test part index')) - parser.add_argument(p1+'num-seg-parts', default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - - parser.add_argument(p1+'eval-set', type=str.lower, - default='enroll-test', - choices=['enroll-test','enroll-coh','coh-test','coh-coh'], - help=('evaluation subset')) + p1 = "--" + prefix + "-" + p2 = prefix + "_" + parser.add_argument( + p1 + "tlist-sep", + dest=(p2 + "tlist_sep"), + default=" ", + help=("trial lists field separator"), + ) + + parser.add_argument( + p1 + "model-part-idx", + dest=(p2 + "model_idx"), + default=1, + type=int, + help=("model part index"), + ) + parser.add_argument( + p1 + "num-model-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the model" + "list to run evaluation in parallel" + ), + ) + parser.add_argument( + p1 + "seg-part-idx", + dest=(p2 + "seg_idx"), + default=1, + type=int, + help=("test part index"), + ) + parser.add_argument( + p1 + "num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + + 
parser.add_argument( + p1 + "eval-set", + type=str.lower, + default="enroll-test", + choices=["enroll-test", "enroll-coh", "coh-test", "coh-coh"], + help=("evaluation subset"), + ) diff --git a/hyperion/helpers/plda_factory.py b/hyperion/helpers/plda_factory.py index 18c6ce03..b9c2ec60 100644 --- a/hyperion/helpers/plda_factory.py +++ b/hyperion/helpers/plda_factory.py @@ -7,150 +7,207 @@ from ..pdfs.plda import FRPLDA, SPLDA, PLDA + class PLDAFactory(object): """Class to create PLDA objects.""" - + @staticmethod - def create_plda(plda_type, y_dim=None, z_dim=None, fullcov_W=True, - update_mu=True, update_V=True, update_U=True, - update_B=True, update_W=True, update_D=True, - floor_iD=1e-5, - name='plda', **kwargs): - - if plda_type == 'frplda': - return FRPLDA(fullcov_W=fullcov_W, - update_mu=update_mu, update_B=update_B, - update_W=update_W, name=name, **kwargs) - if plda_type == 'splda': - return SPLDA(y_dim=y_dim, fullcov_W=fullcov_W, - update_mu=update_mu, update_V=update_V, - update_W=update_W, name=name, **kwargs) - - if plda_type == 'plda': - return PLDA(y_dim=y_dim, z_dim=z_dim, floor_iD=floor_iD, - update_mu=update_mu, update_V=update_V, - update_U=update_U, update_D=update_D, - name=name, **kwargs) - - - + def create_plda( + plda_type, + y_dim=None, + z_dim=None, + fullcov_W=True, + update_mu=True, + update_V=True, + update_U=True, + update_B=True, + update_W=True, + update_D=True, + floor_iD=1e-5, + name="plda", + **kwargs + ): + + if plda_type == "frplda": + return FRPLDA( + fullcov_W=fullcov_W, + update_mu=update_mu, + update_B=update_B, + update_W=update_W, + name=name, + **kwargs + ) + if plda_type == "splda": + return SPLDA( + y_dim=y_dim, + fullcov_W=fullcov_W, + update_mu=update_mu, + update_V=update_V, + update_W=update_W, + name=name, + **kwargs + ) + + if plda_type == "plda": + return PLDA( + y_dim=y_dim, + z_dim=z_dim, + floor_iD=floor_iD, + update_mu=update_mu, + update_V=update_V, + update_U=update_U, + update_D=update_D, + name=name, + **kwargs + ) + @staticmethod def load_plda(plda_type, model_file): - if plda_type == 'frplda': + if plda_type == "frplda": return FRPLDA.load(model_file) - elif plda_type == 'splda': + elif plda_type == "splda": return SPLDA.load(model_file) - elif plda_type == 'plda': + elif plda_type == "plda": return PLDA.load(model_file) - @staticmethod def filter_train_args(prefix=None, **kwargs): - valid_args = ('plda_type', 'y_dim', 'z_dim', - 'diag_W', 'no_update_mu', 'no_update_V', 'no_update_U', - 'no_update_B', 'no_update_W', 'no_update_D', 'floor_iD', - 'epochs', 'ml_md', 'md_epochs', 'name') - d = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - neg_args1 = ('diag_W', 'no_update_mu', 'no_update_V', 'no_update_U', - 'no_update_B', 'no_update_W', 'no_update_D') - neg_args2 = ('fullcov_W', 'update_mu', 'update_V', 'update_U', - 'update_B', 'update_W', 'update_D') - - for a,b in zip(ne_args1, neg_args2): + valid_args = ( + "plda_type", + "y_dim", + "z_dim", + "diag_W", + "no_update_mu", + "no_update_V", + "no_update_U", + "no_update_B", + "no_update_W", + "no_update_D", + "floor_iD", + "epochs", + "ml_md", + "md_epochs", + "name", + ) + d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + neg_args1 = ( + "diag_W", + "no_update_mu", + "no_update_V", + "no_update_U", + "no_update_B", + "no_update_W", + "no_update_D", + ) + neg_args2 = ( + "fullcov_W", + "update_mu", + "update_V", + "update_U", + "update_B", + "update_W", + "update_D", + ) + + for a, b in zip(neg_args1, neg_args2): d[b] = not d[a] del d[a] return d - -
@staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'plda-type', - default='splda', - choices=['frplda', 'splda', 'plda'], - help='PLDA type') - - parser.add_argument(p1+'y-dim', type=int, - default=150, - help='num. of eigenvoices') - parser.add_argument(p1+'z-dim', type=int, - default=400, - help='num. of eigenchannels') - - parser.add_argument(p1+'diag-W', - default=False, action='store_false', - help='use diagonal covariance W') - parser.add_argument(p1+'no-update-mu', - default=False, action='store_true', - help='not update mu') - parser.add_argument(p1+'no-update-V', - default=False, action='store_true', - help='not update V') - parser.add_argument(p1+'no-update-U', - default=False, action='store_true', - help='not update U') - - parser.add_argument(p1+'no-update-B', - default=False, action='store_true', - help='not update B') - parser.add_argument(p1+'no-update-W', - default=False, action='store_true', - help='not update W') - parser.add_argument(p1+'no-update-D', - default=False, action='store_true', - help='not update D') - parser.add_argument(p1+'floor-iD', type=float, - default=1e-5, - help='floor for inverse of D matrix') - - - parser.add_argument(p1+'epochs',type=int, - default=40, - help='num. of epochs') - parser.add_argument(p1+'ml-md', - default='ml+md', - choices=['ml+md', 'ml', 'md'], - help=('optimization type')) - - parser.add_argument('--md-epochs', default=None, - type=int, nargs = '+', - help=('epochs in which we do MD, if None we do it in all the epochs')) - - parser.add_argument(p1+'name', - default='plda', - help='model name') - - + p1 = "--" + prefix + "." + + parser.add_argument( + p1 + "plda-type", + default="splda", + choices=["frplda", "splda", "plda"], + help="PLDA type", + ) + + parser.add_argument( + p1 + "y-dim", type=int, default=150, help="num. of eigenvoices" + ) + parser.add_argument( + p1 + "z-dim", type=int, default=400, help="num. of eigenchannels" + ) + + parser.add_argument( + p1 + "diag-W", + default=False, + action="store_false", + help="use diagonal covariance W", + ) + parser.add_argument( + p1 + "no-update-mu", + default=False, + action="store_true", + help="not update mu", + ) + parser.add_argument( + p1 + "no-update-V", default=False, action="store_true", help="not update V" + ) + parser.add_argument( + p1 + "no-update-U", default=False, action="store_true", help="not update U" + ) + + parser.add_argument( + p1 + "no-update-B", default=False, action="store_true", help="not update B" + ) + parser.add_argument( + p1 + "no-update-W", default=False, action="store_true", help="not update W" + ) + parser.add_argument( + p1 + "no-update-D", default=False, action="store_true", help="not update D" + ) + parser.add_argument( + p1 + "floor-iD", + type=float, + default=1e-5, + help="floor for inverse of D matrix", + ) + + parser.add_argument(p1 + "epochs", type=int, default=40, help="num. 
of epochs") + parser.add_argument( + p1 + "ml-md", + default="ml+md", + choices=["ml+md", "ml", "md"], + help=("optimization type"), + ) + + parser.add_argument( + "--md-epochs", + default=None, + type=int, + nargs="+", + help=("epochs in which we do MD, if None we do it in all the epochs"), + ) + + parser.add_argument(p1 + "name", default="plda", help="model name") @staticmethod def filter_eval_args(prefix=None, **kwargs): - valid_args = ('plda_type', 'model_file') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("plda_type", "model_file") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - @staticmethod def add_eval_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'plda-type', - default='splda', - choices=['frplda', 'splda', 'plda'], - help=('PLDA type')) - parser.add_argument(p1+'model-file', required=True, - help=('model file')) - - - + p1 = "--" + prefix + "." + + parser.add_argument( + p1 + "plda-type", + default="splda", + choices=["frplda", "splda", "plda"], + help=("PLDA type"), + ) + parser.add_argument(p1 + "model-file", required=True, help=("model file")) + add_argparse_train_args = add_class_args add_argparse_eval_args = add_eval_args diff --git a/hyperion/helpers/tracking_data_reader.py b/hyperion/helpers/tracking_data_reader.py index b54ad04f..6dfc9a19 100644 --- a/hyperion/helpers/tracking_data_reader.py +++ b/hyperion/helpers/tracking_data_reader.py @@ -2,10 +2,7 @@ Copyright 2018 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -<<<<<<< HEAD -======= ->>>>>>> 75d3f58c01adf8548745c91cee7de3b7390c1b2e import sys import os import argparse @@ -18,14 +15,25 @@ from ..utils import Utt2Info, TrialNdx, ExtSegmentList from ..transforms import TransformList + class TrackingDataReader(object): """ Loads ndx, enroll file and x-vectors to do speaker tracking with PLDA """ - def __init__(self, v_file, ndx_file, enroll_file, segments_file, - preproc, tlist_sep=' ', - model_idx=1, num_model_parts=1, seg_idx=1, num_seg_parts=1): + def __init__( + self, + v_file, + ndx_file, + enroll_file, + segments_file, + preproc, + tlist_sep=" ", + model_idx=1, + num_model_parts=1, + seg_idx=1, + num_seg_parts=1, + ): self.r = DRF.create(v_file) self.preproc = preproc @@ -40,13 +48,11 @@ def __init__(self, v_file, ndx_file, enroll_file, segments_file, ndx = TrialNdx.split(model_idx, num_model_parts, seg_idx, num_seg_parts) enroll = enroll.filter_info(ndx.model_set) segments = segments.filter(ndx.seg_set) - + self.enroll = enroll self.ndx = ndx self.segments = segments - - def read(self, key=None): if key is None: enroll, ndx_seg, segments = self._read_all_utts() @@ -55,18 +61,16 @@ def read(self, key=None): x_e = self.r.read(enroll.key, squeeze=True) x_t = self.r.read(ndx_seg.seg_set, squeeze=True) - + if self.preproc is not None: x_e = self.preproc.predict(x_e) x_t = self.preproc.predict(x_t) return x_e, x_t, enroll.info, ndx_seg, segments - def _read_all_utts(self): ndx_seg = self.ndx.apply_segmentation_to_test(self.segments) return self.enroll, ndx_seg, self.segments - def _read_single_utt(self, key): ndx = self.ndx.filter(self.ndx.model_set, [key]) @@ -75,40 +79,50 @@ def _read_single_utt(self, key): segments = self.segments.filter([key]) return enroll, ndx_seg, segments - - @staticmethod def filter_args(**kwargs): - valid_args = ('tlist_sep', - 'model_idx','num_model_parts', - 'seg_idx', 'num_seg_parts') - 
return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ( + "tlist_sep", + "model_idx", + "num_model_parts", + "seg_idx", + "num_seg_parts", + ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." parser.add_argument( - p1+'tlist-sep', default=' ', - help=('trial lists field separator')) + p1 + "tlist-sep", default=" ", help=("trial lists field separator") + ) parser.add_argument( - p1+'model-part-idx', default=1, type=int, - help=('model part index')) + p1 + "model-part-idx", default=1, type=int, help=("model part index") + ) parser.add_argument( - p1+'num-model-parts', default=1, type=int, - help=('number of parts in which we divide the model' - 'list to run evaluation in parallel')) + p1 + "num-model-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the model" + "list to run evaluation in parallel" + ), + ) parser.add_argument( - p1+'seg-part-idx', default=1, type=int, - help=('test part index')) + p1 + "seg-part-idx", default=1, type=int, help=("test part index") + ) parser.add_argument( - p1+'num-seg-parts', default=1, type=int, - help=('number of parts in which we divide the test list ' - 'to run evaluation in parallel')) - + p1 + "num-seg-parts", + default=1, + type=int, + help=( + "number of parts in which we divide the test list " + "to run evaluation in parallel" + ), + ) + add_argparse_args = add_class_args diff --git a/hyperion/helpers/vector_class_reader.py b/hyperion/helpers/vector_class_reader.py index 156941f8..4f893aac 100644 --- a/hyperion/helpers/vector_class_reader.py +++ b/hyperion/helpers/vector_class_reader.py @@ -18,14 +18,25 @@ class VectorClassReader(object): - """Class to load data to train LDA, PLDA, PDDA. 
- """ - - def __init__(self, v_file, key_file, preproc=None, vlist_sep=' ', - class2int_file=None, - min_spc=1, max_spc=None, spc_pruning_mode='random', - csplit_min_spc=1, csplit_max_spc=None, csplit_mode='random', - csplit_overlap=0, vcr_seed=1024, csplit_once=True): + """Class to load data to train LDA, PLDA, PDDA.""" + + def __init__( + self, + v_file, + key_file, + preproc=None, + vlist_sep=" ", + class2int_file=None, + min_spc=1, + max_spc=None, + spc_pruning_mode="random", + csplit_min_spc=1, + csplit_max_spc=None, + csplit_mode="random", + csplit_overlap=0, + vcr_seed=1024, + csplit_once=True, + ): self.r = DRF.create(v_file) self.u2c = Utt2Info.load(key_file, sep=vlist_sep) @@ -33,9 +44,11 @@ def __init__(self, v_file, key_file, preproc=None, vlist_sep=' ', self.map_class2int = None if class2int_file is not None: - with open(class2int_file, 'r') as f: - self.map_class2int = {v[0]:int(v[1]) for v in [ line.rstrip().split() for line in f ]} - + with open(class2int_file, "r") as f: + self.map_class2int = { + v[0]: int(v[1]) for v in [line.rstrip().split() for line in f] + } + self.rng = np.random.RandomState(vcr_seed) self.csplit_max_spc = csplit_max_spc self.csplit_min_spc = csplit_min_spc @@ -43,69 +56,80 @@ def __init__(self, v_file, key_file, preproc=None, vlist_sep=' ', self.csplit_overlap = csplit_overlap self.csplit_once = csplit_once self._samples_per_class = None - self.u2c = self._filter_by_spc(self.u2c, min_spc, max_spc, spc_pruning_mode, self.rng) + self.u2c = self._filter_by_spc( + self.u2c, min_spc, max_spc, spc_pruning_mode, self.rng + ) if csplit_once: - self.u2c = self._split_classes(self.u2c, self.csplit_min_spc, self.csplit_max_spc, - self.csplit_mode, self.csplit_overlap, self.rng) - + self.u2c = self._split_classes( + self.u2c, + self.csplit_min_spc, + self.csplit_max_spc, + self.csplit_mode, + self.csplit_overlap, + self.rng, + ) - def read(self, return_3d=False, max_length=0): if self.csplit_once: u2c = self.u2c else: - u2c = self._split_classes(self.u2c, self.csplit_min_spc, self.csplit_max_spc, - self.csplit_mode, self.csplit_overlap, self.rng) - + u2c = self._split_classes( + self.u2c, + self.csplit_min_spc, + self.csplit_max_spc, + self.csplit_mode, + self.csplit_overlap, + self.rng, + ) + x = self.r.read(u2c.key, squeeze=True) if self.preproc is not None: x = self.preproc.predict(x) if self.map_class2int is None: - _, class_ids=np.unique(u2c.info, return_inverse=True) + _, class_ids = np.unique(u2c.info, return_inverse=True) else: - class_ids = np.array([ self.map_class2int[k] for k in u2c.info ], dtype=int) + class_ids = np.array([self.map_class2int[k] for k in u2c.info], dtype=int) if return_3d: x, sample_weight = to3D_by_class(x, class_ids, max_length) return x, sample_weight return x, class_ids - - @property def class_names(self): if self.map_class2int is None: return np.unique(self.u2c.info) else: - map_int2class = {k:v for v,k in self.map_class2int.items()} - classes = [ map_int2class[i] for i in range(len(map_int2class))] + map_int2class = {k: v for v, k in self.map_class2int.items()} + classes = [map_int2class[i] for i in range(len(map_int2class))] return np.asarray(classes) - - + @property def samples_per_class(self): if self._samples_per_class is None: if self.csplit_once: u2c = self.u2c else: - u2c = self._split_classes(self.u2c, self.csplit_min_spc, self.csplit_max_spc, - self.csplit_mode, self.csplit_overlap, self.rng) - _, self._samples_per_class=np.unique(u2c.info, return_counts=True) + u2c = self._split_classes( + self.u2c, + 
self.csplit_min_spc, + self.csplit_max_spc, + self.csplit_mode, + self.csplit_overlap, + self.rng, + ) + _, self._samples_per_class = np.unique(u2c.info, return_counts=True) return self._samples_per_class - - @property def max_samples_per_class(self): num_spc = self.samples_per_class return np.max(num_spc) - - @staticmethod - def _filter_by_spc(u2c, min_spc=1, max_spc=None, spc_pruning_mode='last', rng=None): - if min_spc <= 1 and max_spc==None: + def _filter_by_spc(u2c, min_spc=1, max_spc=None, spc_pruning_mode="last", rng=None): + if min_spc <= 1 and max_spc == None: return u2c if min_spc > 1: @@ -114,158 +138,177 @@ def _filter_by_spc(u2c, min_spc=1, max_spc=None, spc_pruning_mode='last', rng=No u2c = u2c.filter_info(filter_key) if max_spc is not None: - classes, class_ids, num_spc=np.unique( - u2c.info, return_inverse=True, return_counts=True) - + classes, class_ids, num_spc = np.unique( + u2c.info, return_inverse=True, return_counts=True + ) + if np.all(num_spc <= max_spc): return u2c f = np.ones_like(class_ids, dtype=bool) - for i in range(np.max(class_ids)+1): + for i in range(np.max(class_ids) + 1): if num_spc[i] > max_spc: indx = np.where(class_ids == i)[0] num_reject = len(indx) - max_spc - if spc_pruning_mode == 'random': - #indx = rng.permutation(indx) - #indx = indx[-num_reject:] + if spc_pruning_mode == "random": + # indx = rng.permutation(indx) + # indx = indx[-num_reject:] indx = rng.choice(indx, size=num_reject, replace=False) - if spc_pruning_mode == 'last': + if spc_pruning_mode == "last": indx = indx[-num_reject:] - if spc_pruning_mode == 'first': + if spc_pruning_mode == "first": indx = indx[:num_reject] f[indx] = False - if np.any(f==False): + if np.any(f == False): u2c = Utt2Info.create(u2c.key[f], u2c.info[f]) - - return u2c + return u2c - @staticmethod - def _split_classes(u2c, min_spc, max_spc, mode='sequential', overlap=0, rng=None): + def _split_classes(u2c, min_spc, max_spc, mode="sequential", overlap=0, rng=None): if max_spc is None: return u2c - if mode == 'random_1part': - return VectorClassReader._filter_by_spc(u2c, min_spc, max_spc, 'random', rng) - - _, class_ids, num_spc = np.unique(u2c.info, return_inverse=True, return_counts=True) + if mode == "random_1part": + return VectorClassReader._filter_by_spc( + u2c, min_spc, max_spc, "random", rng + ) + + _, class_ids, num_spc = np.unique( + u2c.info, return_inverse=True, return_counts=True + ) if np.all(num_spc <= max_spc): return VectorClassReader._filter_by_spc(u2c, min_spc) - num_classes = np.max(class_ids)+1 + num_classes = np.max(class_ids) + 1 - shift = max_spc-overlap - new_indx = np.zeros(max_spc*int(np.max(num_spc)*num_classes/shift+1), dtype=int) + shift = max_spc - overlap + new_indx = np.zeros( + max_spc * int(np.max(num_spc) * num_classes / shift + 1), dtype=int + ) new_class_ids = np.zeros_like(new_indx) - + j = 0 new_i = 0 for i in range(num_classes): indx_i = np.where(class_ids == i)[0] if num_spc[i] > max_spc: - num_subclass = int(np.ceil((num_spc[i] - max_spc)/shift + 1)) - if mode == 'sequential': + num_subclass = int(np.ceil((num_spc[i] - max_spc) / shift + 1)) + if mode == "sequential": l = 0 - for k in range(num_subclass-1): - new_indx[j:j+max_spc] = indx_i[l:l+max_spc] - new_class_ids[j:j+max_spc] = new_i + for k in range(num_subclass - 1): + new_indx[j : j + max_spc] = indx_i[l : l + max_spc] + new_class_ids[j : j + max_spc] = new_i l += shift j += max_spc new_i += 1 - n = num_spc[i] - (num_subclass-1)*shift - new_indx[j:j+n] = indx_i[l:l+n] - new_class_ids[j:j+n] = new_i + n = 
num_spc[i] - (num_subclass - 1) * shift + new_indx[j : j + n] = indx_i[l : l + n] + new_class_ids[j : j + n] = new_i j += n new_i += 1 - if mode == 'random': + if mode == "random": for k in range(num_subclass): - #indx[j:j+max_spc] = rng.permutation(indx_i)[:max_spc] - new_indx[j:j+max_spc] = rng.choice( - indx_i, size=max_spc, replace=False) - new_class_ids[j:j+max_spc] = new_i + # indx[j:j+max_spc] = rng.permutation(indx_i)[:max_spc] + new_indx[j : j + max_spc] = rng.choice( + indx_i, size=max_spc, replace=False + ) + new_class_ids[j : j + max_spc] = new_i j += max_spc new_i += 1 else: - new_indx[j:j+num_spc[i]] = indx_i - new_class_ids[j:j+num_spc[i]] = new_i + new_indx[j : j + num_spc[i]] = indx_i + new_class_ids[j : j + num_spc[i]] = new_i new_i += 1 j += num_spc[i] new_indx = new_indx[:j] - new_class_ids = new_class_ids[:j].astype('U') + new_class_ids = new_class_ids[:j].astype("U") key = u2c.key[new_indx] u2c = Utt2Info.create(key, new_class_ids) - + return VectorClassReader._filter_by_spc(u2c, min_spc) - @staticmethod def filter_args(**kwargs): - valid_args = ('vlist_sep', 'class2int_file', - 'min_spc', 'max_spc', 'spc_pruning_mode', - 'csplit_min_spc', 'csplit_max_spc', - 'csplit_mode', 'csplit_overlap', - 'csplit_once','vcr_seed') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - - + valid_args = ( + "vlist_sep", + "class2int_file", + "min_spc", + "max_spc", + "spc_pruning_mode", + "csplit_min_spc", + "csplit_max_spc", + "csplit_mode", + "csplit_overlap", + "csplit_once", + "vcr_seed", + ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' + p1 = "--" + prefix + "." parser.add_argument( - p1+'vlist-sep', default=' ', - help=('utt2class file field separator')) + p1 + "vlist-sep", default=" ", help=("utt2class file field separator") + ) parser.add_argument( - p1+'class2int-file', default=None, - help=('file that maps class string to integer')) + p1 + "class2int-file", + default=None, + help=("file that maps class string to integer"), + ) parser.add_argument( - p1+'min-spc', type=int, - default=1, - help=('minimum samples per class')) + p1 + "min-spc", type=int, default=1, help=("minimum samples per class") + ) parser.add_argument( - p1+'max-spc', type=int, - default=None, - help=('maximum samples per class')) + p1 + "max-spc", type=int, default=None, help=("maximum samples per class") + ) parser.add_argument( - p1+'spc-pruning-mode', - default='random', - choices=['random', 'first', 'last'], - help=('vector pruning method when spc > max-spc')) + p1 + "spc-pruning-mode", + default="random", + choices=["random", "first", "last"], + help=("vector pruning method when spc > max-spc"), + ) parser.add_argument( - p1+'csplit-min-spc', + p1 + "csplit-min-spc", type=int, default=None, - help=('minimum samples per class when doing class spliting')) + help=("minimum samples per class when doing class spliting"), + ) parser.add_argument( - p1+'csplit-max-spc', type=int, + p1 + "csplit-max-spc", + type=int, default=None, - help=('split one class into subclasses with ' - 'spc <= csplit-max-spc')) + help=("split one class into subclasses with " "spc <= csplit-max-spc"), + ) parser.add_argument( - p1+'csplit-mode', - default='random', type=str.lower, - choices = ['sequential', 'random', 'random_1subclass'], - help=('class splitting mode')) + p1 + "csplit-mode", + default="random", + type=str.lower, + choices=["sequential", "random", 
"random_1subclass"], + help=("class splitting mode"), + ) parser.add_argument( - p1+'csplit-overlap', type=float, - default=0, help=('overlap between subclasses')) + p1 + "csplit-overlap", + type=float, + default=0, + help=("overlap between subclasses"), + ) parser.add_argument( - p1+'no-csplit-once', - default=True, action='store_false', - help=('class spliting done in each iteration')) + p1 + "no-csplit-once", + default=True, + action="store_false", + help=("class spliting done in each iteration"), + ) parser.add_argument( - p1+'vcr-seed', type=int, - default=1024, help=('seed for rng')) - + p1 + "vcr-seed", type=int, default=1024, help=("seed for rng") + ) - add_argparse_args = add_class_args - + add_argparse_args = add_class_args diff --git a/hyperion/helpers/vector_reader.py b/hyperion/helpers/vector_reader.py index 776dee00..3f0fa1d2 100644 --- a/hyperion/helpers/vector_reader.py +++ b/hyperion/helpers/vector_reader.py @@ -17,16 +17,14 @@ class VectorReader(object): - """Class to load data to train PCA, centering, whitening. - """ - def __init__(self, v_file, key_file, preproc=None, vlist_sep=' '): + """Class to load data to train PCA, centering, whitening.""" + + def __init__(self, v_file, key_file, preproc=None, vlist_sep=" "): self.r = DRF.create(v_file) self.scp = SCPList.load(key_file, sep=vlist_sep) self.preproc = preproc - - def read(self): try: x = self.r.read(self.scp.key, squeeze=True) @@ -37,35 +35,28 @@ def read(self): if self.preproc is not None: for i in range(len(x)): if x[i].ndim == 1: - x[i] = x[i][None,:] + x[i] = x[i][None, :] x[i] = self.preproc.predict(x[i]) - - return x + return x @staticmethod def filter_args(**kwargs): - valid_args = ('vlist_sep') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = "vlist_sep" + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--vlist-sep', default=' ', - help=('utterance file field separator')) - + "--vlist-sep", default=" ", help=("utterance file field separator") + ) + if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='vector reader params') - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='vector reader params') add_argparse_args = add_class_args diff --git a/hyperion/hyp_defs.py b/hyperion/hyp_defs.py index 540e25ee..3c994902 100644 --- a/hyperion/hyp_defs.py +++ b/hyperion/hyp_defs.py @@ -6,9 +6,9 @@ import numpy as np -_FLOAT_CPU = 'float64' -_FLOAT_KERAS = 'float32' -_FLOAT_SAVE = 'float32' +_FLOAT_CPU = "float64" +_FLOAT_KERAS = "float32" +_FLOAT_SAVE = "float32" def float_cpu(): @@ -38,11 +38,13 @@ def set_float_save(float_save): _FLOAT_SAVE = float_save -logging_levels = { 0: logging.WARN, 1: logging.INFO, 2: logging.DEBUG, 3: 5} - +logging_levels = {0: logging.WARN, 1: logging.INFO, 2: logging.DEBUG, 3: 5} + + def config_logger(verbose_level): logging_level = logging_levels[verbose_level] logging.basicConfig( level=logging_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s") + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) diff --git a/hyperion/hyp_model.py b/hyperion/hyp_model.py index 0d4d5211..0ffd2285 100644 --- a/hyperion/hyp_model.py +++ b/hyperion/hyp_model.py @@ -12,6 +12,7 @@ from .hyp_defs import 
float_save, float_cpu + class HypModel(object): __metaclass__ = ABCMeta @@ -19,59 +20,50 @@ def __init__(self, name=None, **kwargs): self.name = name self._is_init = False - def copy(self): return deepcopy(self) - @property def is_init(self): return self._is_init - def init_to_false(self): self._is_init = False - - + @abstractmethod def initialize(self): pass - @abstractmethod def fit(self, x, sample_weights=None, x_val=None, sample_weights_val=None): pass - @abstractmethod def fit_generator(self, x, x_val=None): pass - @abstractmethod def save(self, file_path): file_dir = os.path.dirname(file_path) - if not(os.path.isdir(file_dir)): + if not (os.path.isdir(file_dir)): os.makedirs(file_dir, exist_ok=True) - with h5py.File(file_path,'w') as f: + with h5py.File(file_path, "w") as f: config = self.to_json() - f.create_dataset('config', data=np.array(config, dtype='S')) + f.create_dataset("config", data=np.array(config, dtype="S")) self.save_params(f) - @abstractmethod def save_params(self, f): - assert True, 'save_params method not defined for %s' % (self.__class__.__name__) + assert True, "save_params method not defined for %s" % (self.__class__.__name__) - def _save_params_from_dict(self, f, params, dtypes=None): if dtypes is None: dtypes = dict((k, float_save()) for k in params) if self.name is None: - prefix = '' + prefix = "" else: - prefix = self.name + '/' + prefix = self.name + "/" for k, v in params.items(): if v is None: continue @@ -79,60 +71,53 @@ def _save_params_from_dict(self, f, params, dtypes=None): v = np.asarray(v) p_name = prefix + k f.create_dataset(p_name, data=v.astype(dtypes[k], copy=False)) - @classmethod def load_config(cls, file_path): try: - with h5py.File(file_path,'r') as f: - json_str = str(np.asarray(f['config']).astype('U')) + with h5py.File(file_path, "r") as f: + json_str = str(np.asarray(f["config"]).astype("U")) return cls.load_config_from_json(json_str) except: - with open(file_path,'r') as f: + with open(file_path, "r") as f: return cls.load_config_from_json(f.read()) - - @classmethod def load(cls, file_path): - with h5py.File(file_path,'r') as f: - json_str = str(np.asarray(f['config']).astype('U')) + with h5py.File(file_path, "r") as f: + json_str = str(np.asarray(f["config"]).astype("U")) config = cls.load_config_from_json(json_str) return cls.load_params(f, config) - @classmethod def load_params(cls, f, config): - return cls(name=config['name']) + return cls(name=config["name"]) - @staticmethod def _load_params_to_dict(f, name, params, dtypes=None): if dtypes is None: dtypes = dict((k, float_cpu()) for k in params) if name is None: - prefix = '' + prefix = "" else: - prefix = name + '/' + prefix = name + "/" param_dict = {} for k in params: p_name = prefix + k if p_name in f: - param_dict[k] = np.asarray(f[p_name]).astype(dtype=dtypes[k], copy=False) + param_dict[k] = np.asarray(f[p_name]).astype( + dtype=dtypes[k], copy=False + ) else: param_dict[k] = None return param_dict - @abstractmethod def get_config(self): - config = { - 'class_name': self.__class__.__name__, - 'name': self.name} + config = {"class_name": self.__class__.__name__, "name": self.name} return config - def to_json(self, **kwargs): # Piece of code borrowed from keras def get_json_type(obj): @@ -143,12 +128,11 @@ def get_json_type(obj): # if obj is a python 'type' if type(obj).__name__ == type.__name__: return obj.__name__ - - raise TypeError('Not JSON Serializable:', obj) - config=self.get_config() - return json.dumps(config, default=get_json_type, **kwargs) + raise TypeError("Not 
JSON Serializable:", obj) + config = self.get_config() + return json.dumps(config, default=get_json_type, **kwargs) @staticmethod def load_config_from_json(json_str): diff --git a/hyperion/io/__init__.py b/hyperion/io/__init__.py index 5948bfbc..5ddf131b 100644 --- a/hyperion/io/__init__.py +++ b/hyperion/io/__init__.py @@ -17,7 +17,10 @@ from .audio_reader import * from .audio_writer import * -from .packed_audio_reader import SequentialPackedAudioReader, RandomAccessPackedAudioReader +from .packed_audio_reader import ( + SequentialPackedAudioReader, + RandomAccessPackedAudioReader, +) from .packed_audio_writer import PackedAudioWriter @@ -27,7 +30,4 @@ from .kaldi_data_reader import * - -#from .queues import * - - +# from .queues import * diff --git a/hyperion/io/ark_data_reader.py b/hyperion/io/ark_data_reader.py index 22fef23f..7f6ec350 100644 --- a/hyperion/io/ark_data_reader.py +++ b/hyperion/io/ark_data_reader.py @@ -16,16 +16,16 @@ class SequentialArkDataReader(SequentialDataReader): """Abstract base class to read Ark feature files in - sequential order. - - Attributes: - file_path: ark or scp file to read. - transform: TransformList object, applies a transformation to the - features after reading them from disk. - part_idx: It splits the input into num_parts and writes only - part part_idx, where part_idx=1,...,num_parts. - num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. + sequential order. + + Attributes: + file_path: ark or scp file to read. + transform: TransformList object, applies a transformation to the + features after reading them from disk. + part_idx: It splits the input into num_parts and writes only + part part_idx, where part_idx=1,...,num_parts. + num_parts: Number of parts to split the input data. + split_by_key: If True, all the elements with the same key go to the same part. """ def __init__(self, file_path, **kwargs): @@ -34,29 +34,24 @@ def __init__(self, file_path, **kwargs): self.lock = threading.Lock() self.cur_file = None - def close(self): """Closes input file.""" if self.f is not None: self.f.close() self.f = None - - def _seek(self, offset): """Moves the pointer of the input file. - + Args: offset: Byte where we want to put the pointer. """ cur_pos = self.f.tell() delta = offset - cur_pos self.f.seek(delta, 1) - - def _open_archive(self, file_path, offset=0): - """Opens the current file if it is not open and moves the + """Opens the current file if it is not open and moves the file pointer to a given position. Closes previous open Ark files. @@ -67,18 +62,16 @@ def _open_archive(self, file_path, offset=0): if self.f is None or file_path != self.cur_file: self.close() self.cur_file = file_path - self.f = open(file_path, 'rb') + self.f = open(file_path, "rb") if offset > 0: self._seek(offset) - - def read_num_rows(self, num_records=0, assert_same_dim=True): """Reads the number of rows in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads all the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -88,17 +81,14 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): Integer numpy array with num_records number of rows. 
""" keys, shapes = self.read_shapes(num_records, assert_same_dim) - num_rows = np.array([s[0] if len(s)==2 else 1 for s in shapes], - dtype=int) + num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - - def read_dims(self, num_records=0, assert_same_dim=True): """Reads the number of columns in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -109,70 +99,61 @@ def read_dims(self, num_records=0, assert_same_dim=True): """ keys, shapes = self.read_shapes(num_records, False) dims = np.array([s[-1] for s in shapes], dtype=int) - if assert_same_dim and len(dims)>0: - assert np.all(dims==dims[0]) + if assert_same_dim and len(dims) > 0: + assert np.all(dims == dims[0]) return keys, dims - - - class SequentialArkFileDataReader(SequentialArkDataReader): """Class to read feature matrices/vectors in - sequential order from a single Ark file. - - Attributes: - file_path: Ark file to read. - transform: TransformList object, applies a transformation to the - features after reading them from disk. - part_idx: It splits the input into num_parts and writes only - part part_idx, where part_idx=1,...,num_parts. - num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. + sequential order from a single Ark file. + + Attributes: + file_path: Ark file to read. + transform: TransformList object, applies a transformation to the + features after reading them from disk. + part_idx: It splits the input into num_parts and writes only + part part_idx, where part_idx=1,...,num_parts. + num_parts: Number of parts to split the input data. + split_by_key: If True, all the elements with the same key go to the same part. """ def __init__(self, file_path, **kwargs): super(SequentialArkFileDataReader, self).__init__( - file_path, permissive=False, **kwargs) + file_path, permissive=False, **kwargs + ) self._open_archive(self.file_path) self._eof = False self._keys = None if self.num_parts > 1: raise NotImplementedError( - 'Dataset splitting not available for %s' % - self.__class__.__name__) - + "Dataset splitting not available for %s" % self.__class__.__name__ + ) - def reset(self): """Puts the file pointer back to the begining of the file""" if self.f is not None: self.f.seek(0, 0) self._eof = False - - def eof(self): """Returns True when it reaches the end of the ark file.""" return self._eof or self.f is None - @property def keys(self): if self._keys is None: self.reset() self._keys, _ = self.read_shapes() self.reset() - + return self._keys - - def read_shapes(self, num_records=0, assert_same_dim=True): """Reads the shapes in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. 
@@ -185,40 +166,37 @@ def read_shapes(self, num_records=0, assert_same_dim=True): shapes = [] count = 0 binary = False - while num_records==0 or count < num_records: + while num_records == 0 or count < num_records: key_i = read_token(self.f, binary) - if key_i == '': + if key_i == "": self._eof = True break binary = init_kaldi_input_stream(self.f) - shape_i = KaldiMatrix.read_shape( - self.f, binary, sequential_mode=True) + shape_i = KaldiMatrix.read_shape(self.f, binary, sequential_mode=True) keys.append(key_i) shapes.append(shape_i) count += 1 - if assert_same_dim and len(shapes)>0: + if assert_same_dim and len(shapes) > 0: dims = np.array([s[-1] for s in shapes], dtype=int) assert np.all(dims == dims[0]) - - return keys, shapes + return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): """Reads next num_records feature matrices/vectors. - + Args: num_records: Number of feature matrices to read. - squeeze: If True, it converts the list of + squeeze: If True, it converts the list of matrices/vectors to 3D/2D numpy array. All matrices need to have same number of rows. - offset: List of integers or numpy array of with the first row to + offset: List of integers or numpy array of with the first row to read from each feature matrix. - num_rows: List of integers or numpy array of with the + num_rows: List of integers or numpy array of with the number of rows to read from each feature matrix. If 0 it reads all the rows. @@ -226,107 +204,102 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): key: List of recording names. data: List of feature matrices/vectors or 3D/2D numpy array. """ - row_offset_is_list = (isinstance(row_offset, list) or - isinstance(row_offset, np.ndarray)) - num_rows_is_list = (isinstance(num_rows, list) or - isinstance(num_rows, np.ndarray)) + row_offset_is_list = isinstance(row_offset, list) or isinstance( + row_offset, np.ndarray + ) + num_rows_is_list = isinstance(num_rows, list) or isinstance( + num_rows, np.ndarray + ) keys = [] data = [] count = 0 binary = False with self.lock: - while num_records==0 or count < num_records: + while num_records == 0 or count < num_records: key_i = read_token(self.f, binary) - if key_i == '': + if key_i == "": self._eof = True break - + row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows - + binary = init_kaldi_input_stream(self.f) data_i = KaldiMatrix.read( - self.f, binary, row_offset_i, num_rows_i, - sequential_mode=True).to_ndarray() + self.f, binary, row_offset_i, num_rows_i, sequential_mode=True + ).to_ndarray() assert num_rows_i == 0 or data_i.shape[0] == num_rows_i if self.transform is not None: data_i = self.transform.predict(data_i) - + keys.append(key_i) data.append(data_i) count += 1 if squeeze: data = self._squeeze(data) - - return keys, data - - + return keys, data class SequentialArkScriptDataReader(SequentialArkDataReader): """Class to read Ark feature files indexed by a scp file in - sequential order. - - Attributes: - file_path: scp file to read. - path_prefix: If input_spec is a scp file, it pre-appends - path_prefix string to the second column of - the scp file. This is useful when data - is read from a different directory of that - it was created. - scp_sep: Separator for scp files (default ' '). - transform: TransformList object, applies a transformation to the - features after reading them from disk. 
- part_idx: It splits the input into num_parts and writes only - part part_idx, where part_idx=1,...,num_parts. - num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. + sequential order. + + Attributes: + file_path: scp file to read. + path_prefix: If input_spec is a scp file, it pre-appends + path_prefix string to the second column of + the scp file. This is useful when data + is read from a different directory of that + it was created. + scp_sep: Separator for scp files (default ' '). + transform: TransformList object, applies a transformation to the + features after reading them from disk. + part_idx: It splits the input into num_parts and writes only + part part_idx, where part_idx=1,...,num_parts. + num_parts: Number of parts to split the input data. + split_by_key: If True, all the elements with the same key go to the same part. """ - - def __init__(self, file_path, path_prefix=None, scp_sep=' ', **kwargs): + + def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): super(SequentialArkScriptDataReader, self).__init__( - file_path, permissive=False, **kwargs) + file_path, permissive=False, **kwargs + ) self.scp = SCPList.load(self.file_path, sep=scp_sep) if self.num_parts > 1: - self.scp = self.scp.split(self.part_idx, self.num_parts, - group_by_key=self.split_by_key) - + self.scp = self.scp.split( + self.part_idx, self.num_parts, group_by_key=self.split_by_key + ) + if path_prefix is not None: self.scp.add_prefix_to_filepath(path_prefix) - + self.cur_item = 0 - @property def keys(self): return self.scp.key - def reset(self): """Closes all the open Ark files and puts the read pointer pointing to the first element in the scp file.""" self.close() self.cur_item = 0 - - def eof(self): """Returns True when all the elements in the scp have been read.""" return self.cur_item == len(self.scp) - - def read_shapes(self, num_records=0, assert_same_dim=True): """Reads the shapes in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -337,7 +310,7 @@ def read_shapes(self, num_records=0, assert_same_dim=True): """ if num_records == 0: num_records = len(self.scp) - self.cur_item - + keys = [] shapes = [] for i in range(num_records): @@ -349,12 +322,10 @@ def read_shapes(self, num_records=0, assert_same_dim=True): self._open_archive(file_path, offset) binary = init_kaldi_input_stream(self.f) - shape_i = KaldiMatrix.read_shape( - self.f, binary, sequential_mode=True) + shape_i = KaldiMatrix.read_shape(self.f, binary, sequential_mode=True) + + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) - shape_i = self._apply_range_to_shape( - shape_i, row_offset_i, num_rows_i) - keys.append(key) shapes.append(shape_i) self.cur_item += 1 @@ -362,23 +333,20 @@ def read_shapes(self, num_records=0, assert_same_dim=True): if assert_same_dim: dims = np.array([s[-1] for s in shapes], dtype=int) assert np.all(dims == dims[0]) - - return keys, shapes - + return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): """Reads next num_records feature matrices/vectors. - + Args: num_records: Number of feature matrices to read. 
- squeeze: If True, it converts the list of + squeeze: If True, it converts the list of matrices/vectors to 3D/2D numpy array. All matrices need to have same number of rows. - offset: List of integers or numpy array of with the first row to + offset: List of integers or numpy array of with the first row to read from each feature matrix. - num_rows: List of integers or numpy array of with the + num_rows: List of integers or numpy array of with the number of rows to read from each feature matrix. If 0 it reads all the rows. @@ -389,10 +357,12 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self.scp) - self.cur_item - row_offset_is_list = (isinstance(row_offset, list) or - isinstance(row_offset, np.ndarray)) - num_rows_is_list = (isinstance(num_rows, list) or - isinstance(num_rows, np.ndarray)) + row_offset_is_list = isinstance(row_offset, list) or isinstance( + row_offset, np.ndarray + ) + num_rows_is_list = isinstance(num_rows, list) or isinstance( + num_rows, np.ndarray + ) keys = [] data = [] @@ -406,13 +376,14 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows row_offset_i, num_rows_i = self._combine_ranges( - range_spec, row_offset_i, num_rows_i) + range_spec, row_offset_i, num_rows_i + ) self._open_archive(file_path, offset) binary = init_kaldi_input_stream(self.f) data_i = KaldiMatrix.read( - self.f, binary, row_offset_i, num_rows_i, - sequential_mode=True).to_ndarray() + self.f, binary, row_offset_i, num_rows_i, sequential_mode=True + ).to_ndarray() assert num_rows_i == 0 or data_i.shape[0] == num_rows_i @@ -425,52 +396,49 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if squeeze: data = self._squeeze(data) - - return keys, data - + return keys, data class RandomAccessArkDataReader(RandomAccessDataReader): - """Class to read Ark files in random order, using scp file to - index the Ark files. - - Attributes: - file_path: scp file to read. - path_prefix: If input_spec is a scp file, it pre-appends - path_prefix string to the second column of - the scp file. This is useful when data - is read from a different directory of that - it was created. - transform: TransformList object, applies a transformation to the - features after reading them from disk. - permissive: If True, if the data that we want to read is not in the file - it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). + """Class to read Ark files in random order, using scp file to + index the Ark files. + + Attributes: + file_path: scp file to read. + path_prefix: If input_spec is a scp file, it pre-appends + path_prefix string to the second column of + the scp file. This is useful when data + is read from a different directory of that + it was created. + transform: TransformList object, applies a transformation to the + features after reading them from disk. + permissive: If True, if the data that we want to read is not in the file + it returns an empty matrix, if False it raises an exception. + scp_sep: Separator for scp files (default ' '). 
""" - - def __init__(self, file_path, path_prefix=None, - transform=None, permissive=False, scp_sep=' '): + + def __init__( + self, file_path, path_prefix=None, transform=None, permissive=False, scp_sep=" " + ): super(RandomAccessArkDataReader, self).__init__( - file_path, transform, permissive) - + file_path, transform, permissive + ) + self.scp = SCPList.load(self.file_path, sep=scp_sep) if path_prefix is not None: self.scp.add_prefix_to_filepath(path_prefix) - archives, archive_idx = np.unique( - self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) - self.locks = [ threading.Lock() for i in range(len(self.archives)) ] - + self.locks = [threading.Lock() for i in range(len(self.archives))] @property def keys(self): return self.scp.key - def close(self): """Closes all the open Ark files.""" for f in self.f: @@ -478,15 +446,13 @@ def close(self): f.close() self.f = [None] * len(self.f) - - def _open_archive(self, key_idx, offset=0): - """Opens the Ark file correspoding to a given feature/matrix - if it is not already open and moves the file pointer to the + """Opens the Ark file correspoding to a given feature/matrix + if it is not already open and moves the file pointer to the point where we can read that feature matrix. If the file was already open, it only moves the file pointer. - + Args: key_idx: Integer position of the feature matrix in the scp file. offset: Byte where we can find the feature matrix in the Ark file. @@ -498,20 +464,18 @@ def _open_archive(self, key_idx, offset=0): archive_idx = self.archive_idx[key_idx] with self.locks[archive_idx]: if self.f[archive_idx] is None: - self.f[archive_idx] = open(self.archives[archive_idx], 'rb') + self.f[archive_idx] = open(self.archives[archive_idx], "rb") f = self.f[archive_idx] f.seek(offset, 0) return f, self.locks[archive_idx] - - def read_num_rows(self, keys, assert_same_dim=True): """Reads the number of rows in the feature matrices of the dataset. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the number of rows. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -520,17 +484,14 @@ def read_num_rows(self, keys, assert_same_dim=True): Integer numpy array with the number of rows for the recordings in keys. """ shapes = self.read_shapes(keys, assert_same_dim) - num_rows = np.array([s[0] if len(s)==2 else 1 for s in shapes], - dtype=np.int) + num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=np.int) return num_rows - - def read_dims(self, keys, assert_same_dim=True): """Reads the number of columns in the feature matrices of the dataset. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the number of columns. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -541,16 +502,14 @@ def read_dims(self, keys, assert_same_dim=True): shapes = self.read_shapes(keys, False) dims = np.array([s[-1] for s in shapes], dtype=np.int) if assert_same_dim: - assert np.all(dims==dims[0]) + assert np.all(dims == dims[0]) return dims - - def read_shapes(self, keys, assert_same_dim=True): """Reads the shapes in the feature matrices of the dataset. 
- + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the shapes. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -563,52 +522,47 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - + if not (key in self.scp): if self.permissive: shapes.append((0,)) continue else: - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) index = self.scp.get_index(key) _, file_path, offset, range_spec = self.scp[index] - row_offset_i, num_rows_i = self._combine_ranges( - range_spec, 0, 0) - + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) + f, lock = self._open_archive(index) with lock: f.seek(offset, 0) binary = init_kaldi_input_stream(f) - shape_i = KaldiMatrix.read_shape( - f, binary, sequential_mode=False) + shape_i = KaldiMatrix.read_shape(f, binary, sequential_mode=False) + + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) - shape_i = self._apply_range_to_shape( - shape_i, row_offset_i, num_rows_i) - shapes.append(shape_i) if assert_same_dim: dims = np.array([s[-1] for s in shapes], dtype=np.int) assert np.all(dims == dims[0]) - - return shapes - + return shapes def read(self, keys, squeeze=False, row_offset=0, num_rows=0): """Reads the feature matrices/vectors for the recordings in keys. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the feature matrices/vectors. - squeeze: If True, it converts the list of + squeeze: If True, it converts the list of matrices/vectors to 3D/2D numpy array. All matrices need to have same number of rows. - offset: List of integers or numpy array of with the first row to + offset: List of integers or numpy array of with the first row to read from each feature matrix. - num_rows: List of integers or numpy array of with the + num_rows: List of integers or numpy array of with the number of rows to read from each feature matrix. If 0 it reads all the rows. 
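permissive=True turns a missing key into an empty result instead of an exception, which is the contract the read loops below rely on. A toy illustration of that behavior (the dict stands in for the scp index):

    import numpy as np

    def lookup(index, key, permissive=False):
        # Same contract as the readers: empty array or hard failure.
        if key not in index:
            if permissive:
                return np.array([], dtype=np.float32)
            raise Exception("Key %s not found" % key)
        return index[key]

    index = {"utt1": np.zeros((10, 4), dtype=np.float32)}
    print(lookup(index, "utt2", permissive=True).shape)  # (0,)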
@@ -618,24 +572,26 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = (isinstance(row_offset, list) or - isinstance(row_offset, np.ndarray)) - num_rows_is_list = (isinstance(num_rows, list) or - isinstance(num_rows, np.ndarray)) + row_offset_is_list = isinstance(row_offset, list) or isinstance( + row_offset, np.ndarray + ) + num_rows_is_list = isinstance(num_rows, list) or isinstance( + num_rows, np.ndarray + ) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: assert len(num_rows) == len(keys) data = [] - for i,key in enumerate(keys): - + for i, key in enumerate(keys): + if not (key in self.scp): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) index = self.scp.get_index(key) _, file_path, offset, range_spec = self.scp[index] @@ -643,25 +599,25 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows row_offset_i, num_rows_i = self._combine_ranges( - range_spec, row_offset_i, num_rows_i) - + range_spec, row_offset_i, num_rows_i + ) + f, lock = self._open_archive(index) with lock: f.seek(offset, 0) binary = init_kaldi_input_stream(f) data_i = KaldiMatrix.read( - f, binary, row_offset_i, num_rows_i, - sequential_mode=False).to_ndarray() + f, binary, row_offset_i, num_rows_i, sequential_mode=False + ).to_ndarray() assert num_rows_i == 0 or data_i.shape[0] == num_rows_i if self.transform is not None: data_i = self.transform.predict(data_i) - + data.append(data_i) if squeeze: data = self._squeeze(data, self.permissive) - + return data - diff --git a/hyperion/io/ark_data_writer.py b/hyperion/io/ark_data_writer.py index 77eed2de..50fdd3f6 100644 --- a/hyperion/io/ark_data_writer.py +++ b/hyperion/io/ark_data_writer.py @@ -13,10 +13,9 @@ from .data_writer import DataWriter - class ArkDataWriter(DataWriter): """Class to write Ark feature files. - + Attributes: archive_path: output data file path. script_path: optional output scp file. @@ -24,33 +23,29 @@ class ArkDataWriter(DataWriter): flush: If True, it flushes the output after writing each feature file. compress: It True, it uses Kaldi compression. compression_method: Kaldi compression method: - {auto (default), speech_feat, + {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. scp_sep: Separator for scp files (default ' '). 
""" - def __init__(self, archive_path, script_path=None, - binary=True, **kwargs): - super(ArkDataWriter, self).__init__( - archive_path, script_path, **kwargs) + def __init__(self, archive_path, script_path=None, binary=True, **kwargs): + super(ArkDataWriter, self).__init__(archive_path, script_path, **kwargs) self.binary = binary if binary: - self.f = open(archive_path, 'wb') + self.f = open(archive_path, "wb") else: - self.f = open(archive_path, 'w') + self.f = open(archive_path, "w") if script_path is not None: - self.f_script = open(script_path, 'w') + self.f_script = open(script_path, "w") else: self.f_script = None - - def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type - + with ArkDataWriter('file.h5') as f: f.write(key, data) @@ -58,39 +53,31 @@ def __exit__(self, exc_type, exc_value, traceback): """ self.close() - - def close(self): """Closes the output file""" self.f.close() if self.f_script is not None: self.f_script.close() - - def flush(self): """Flushes the file""" self.f.flush() if self.f_script is not None: self.f_script.flush() - - def _convert_data(self, data): """Converts the feature matrix from numpy array to KaldiMatrix - or KaldiCompressedMatrix. + or KaldiCompressedMatrix. """ if isinstance(data, np.ndarray): data = data.astype(float_save(), copy=False) if self.compress: - return KaldiCompressedMatrix.compress( - data, self.compression_method) + return KaldiCompressedMatrix.compress(data, self.compression_method) return KaldiMatrix(data) - + if isinstance(data, KaldiMatrix): if self.compress: - return KaldiCompressedMatrix.compress( - data, self.compression_method) + return KaldiCompressedMatrix.compress(data, self.compression_method) return data if isinstance(data, KaldiCompressedMatrix): @@ -98,37 +85,36 @@ def _convert_data(self, data): return data.to_matrix() return data - raise ValueError('Data is not ndarray or KaldiMatrix') - + raise ValueError("Data is not ndarray or KaldiMatrix") - def write(self, keys, data): """Writes data to file. - + Args: key: List of recodings names. - data: List of Feature matrices or vectors. - If all the matrices have the same dimension + data: List of Feature matrices or vectors. + If all the matrices have the same dimension it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. 
""" if isinstance(keys, str): keys = [keys] data = [data] - + for i, key_i in enumerate(keys): - assert is_token(key_i), 'Token %s not valid' % key_i + assert is_token(key_i), "Token %s not valid" % key_i write_token(self.f, self.binary, key_i) pos = self.f.tell() data_i = self._convert_data(data[i]) - + init_kaldi_output_stream(self.f, self.binary) data_i.write(self.f, self.binary) if self.f_script is not None: - self.f_script.write('%s%s%s:%d\n' % ( - key_i, self.scp_sep, self.archive_path, pos)) - + self.f_script.write( + "%s%s%s:%d\n" % (key_i, self.scp_sep, self.archive_path, pos) + ) + if self._flush: self.flush() diff --git a/hyperion/io/audio_reader.py b/hyperion/io/audio_reader.py index 100118d2..c6bdeab8 100644 --- a/hyperion/io/audio_reader.py +++ b/hyperion/io/audio_reader.py @@ -15,23 +15,45 @@ from ..hyp_defs import float_cpu from ..utils import SCPList, SegmentList -valid_ext = ['.wav', '.flac', '.ogg' , '.au', '.avr', '.caf', '.htk', '.iff', '.mat', '.mpc', '.oga', '.pvf', '.rf64', '.sd2', '.sds', '.sf', '.voc', 'w64', '.wve', '.xi'] +valid_ext = [ + ".wav", + ".flac", + ".ogg", + ".au", + ".avr", + ".caf", + ".htk", + ".iff", + ".mat", + ".mpc", + ".oga", + ".pvf", + ".rf64", + ".sd2", + ".sds", + ".sf", + ".voc", + "w64", + ".wve", + ".xi", +] + class AudioReader(object): """Class to read audio files from wav, flac or pipe - Attributes: - file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. - segments_path: segments file with format: segment_id file_id tbeg tend - wav_scale: multiplies signal by scale factor + Attributes: + file_path: scp file with formant file_key wavspecifier (audio_file/pipe) or SCPList object. + segments_path: segments file with format: segment_id file_id tbeg tend + wav_scale: multiplies signal by scale factor """ - - def __init__(self, file_path, segments_path=None, wav_scale=2**15-1): + + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): self.file_path = file_path if isinstance(file_path, SCPList): self.scp = file_path else: - self.scp = SCPList.load(file_path, sep=' ', is_wav=True) + self.scp = SCPList.load(file_path, sep=" ", is_wav=True) self.segments_path = segments_path if segments_path is None: @@ -43,41 +65,37 @@ def __init__(self, file_path, segments_path=None, wav_scale=2**15-1): self.segments = segments_path else: self.segments = SegmentList.load( - segments_path, sep=' ', index_by_file=False) + segments_path, sep=" ", index_by_file=False + ) self.wav_scale = wav_scale - @property def keys(self): if self.with_segments: - return np.asarray(self.segments['segment_id']) + return np.asarray(self.segments["segment_id"]) return self.scp.key - def __enter__(self): """Function required when entering contructions of type - with AudioReader('file.h5') as f: - keys, data = f.read() + with AudioReader('file.h5') as f: + keys, data = f.read() """ return self - - def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type - with AudioReader('file.h5') as f: - keys, data = f.read() + with AudioReader('file.h5') as f: + keys, data = f.read() """ pass - @staticmethod - def read_wavspecifier(wavspecifier, scale=2**15, time_offset=0, time_dur=0): + def read_wavspecifier(wavspecifier, scale=2 ** 15, time_offset=0, time_dur=0): """Reads an audiospecifier (audio_file/pipe) - It reads from pipe or from all the files that can be read + It reads from pipe or from all the files that can be read by `libsndfile ` Args: @@ -89,12 +107,12 @@ def 
read_wavspecifier(wavspecifier, scale=2**15, time_offset=0, time_dur=0): """ wavspecifier = wavspecifier.strip() - if wavspecifier[-1] == '|': + if wavspecifier[-1] == "|": wavspecifier = wavspecifier[:-1] x, fs = AudioReader.read_pipe(wavspecifier, scale) - if time_offset == 0 and time_dur==0: + if time_offset == 0 and time_dur == 0: return x, fs - + start_sample = int(math.floor(time_offset * fs)) num_samples = int(math.floor(time_dur * fs)) if num_samples == 0: @@ -111,7 +129,7 @@ def read_wavspecifier(wavspecifier, scale=2**15, time_offset=0, time_dur=0): x *= scale return x, fs - with sf.SoundFile(wavspecifier, 'r') as f: + with sf.SoundFile(wavspecifier, "r") as f: fs = f.samplerate start_sample = int(math.floor(time_offset * fs)) num_samples = int(math.floor(time_dur * fs)) @@ -122,13 +140,10 @@ def read_wavspecifier(wavspecifier, scale=2**15, time_offset=0, time_dur=0): x = scale * f.read(dtype=float_cpu()) return x, fs - raise Exception('Unknown format for %s' % (wavspecifier)) - - - + raise Exception("Unknown format for %s" % (wavspecifier)) @staticmethod - def read_pipe(wavspecifier, scale=2**15): + def read_pipe(wavspecifier, scale=2 ** 15): """Reads wave file from a pipe Args: wavspecifier: Shell command with pipe output @@ -137,13 +152,15 @@ def read_pipe(wavspecifier, scale=2**15): # proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) proc = subprocess.Popen(wavspecifier, shell=True, stdout=subprocess.PIPE) pipe = proc.communicate()[0] - if proc.returncode !=0: - raise Exception('Wave read pipe command %s returned code %d' % (wavspecifier, proc.returncode)) + if proc.returncode != 0: + raise Exception( + "Wave read pipe command %s returned code %d" + % (wavspecifier, proc.returncode) + ) x, fs = sf.read(io.BytesIO(pipe), dtype=float_cpu()) x *= scale return x, fs - def _read_segment(self, segment, time_offset=0, time_dur=0): """Reads a wave segment @@ -152,9 +169,9 @@ def _read_segment(self, segment, time_offset=0, time_dur=0): Returns: Wave, sampling frequency """ - file_id = segment['file_id'] - t_beg = segment['tbeg'] + time_offset - t_end = segment['tend'] + file_id = segment["file_id"] + t_beg = segment["tbeg"] + time_offset + t_end = segment["tend"] if time_dur > 0: t_end_new = t_beg + time_dur assert t_end_new <= t_end @@ -165,26 +182,31 @@ def _read_segment(self, segment, time_offset=0, time_dur=0): num_samples_i = len(x_i) s_beg = int(t_beg * fs_i) if s_beg >= num_samples_i: - raise Exception('segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)' % ( - key, tbeg, sbeg, file_id, num_samples_i)) + raise Exception( + "segment %s tbeg=%.2f (num_sample=%d) longer that wav file %s (num_samples=%d)" + % (key, tbeg, sbeg, file_id, num_samples_i) + ) s_end = int(t_end * fs_i) if s_end > num_samples_i or t_end < 0: s_end = num_samples_i - + x_i = x_i[s_beg:s_end] return x_i, fs_i - - def read(self): pass - class SequentialAudioReader(AudioReader): - - def __init__(self, file_path, segments_path=None, wav_scale=2**15-1, part_idx=1, num_parts=1): + def __init__( + self, + file_path, + segments_path=None, + wav_scale=2 ** 15 - 1, + part_idx=1, + num_parts=1, + ): super().__init__(file_path, segments_path, wav_scale=wav_scale) self.cur_item = 0 self.part_idx = part_idx @@ -193,10 +215,10 @@ def __init__(self, file_path, segments_path=None, wav_scale=2**15-1, part_idx=1, if self.with_segments: self.segments = self.segments.split(self.part_idx, self.num_parts) else: - self.scp = self.scp.split(self.part_idx, 
self.num_parts, group_by_key=False) - + self.scp = self.scp.split( + self.part_idx, self.num_parts, group_by_key=False + ) - def __iter__(self): """Needed to build an iterator, e.g.: r = SequentialAudioReader(...) @@ -206,8 +228,6 @@ def __iter__(self): """ return self - - def __next__(self): """Needed to build an iterator, e.g.: r = SequentialAudioReader(...) @@ -215,24 +235,19 @@ def __next__(self): process(s) """ key, x, fs = self.read(1) - if len(key)==0: + if len(key) == 0: raise StopIteration return key[0], x[0], fs[0] - - def next(self): """__next__ for Python 2""" return self.__next__() - def reset(self): - """Returns the file pointer to the begining of the dataset, - then we can start reading the features again. + """Returns the file pointer to the begining of the dataset, + then we can start reading the features again. """ - self.cur_item=0 - - + self.cur_item = 0 def eof(self): """End of file. @@ -244,10 +259,9 @@ def eof(self): return self.cur_item == len(self.segments) return self.cur_item == len(self.scp) - def read(self, num_records=0, time_offset=0, time_durs=0): """Reads next num_records audio files - + Args: num_records: Number of audio files to read. time_offset: List of floats indicating the start time to read in the utterance. @@ -279,12 +293,13 @@ def read(self, num_records=0, time_offset=0, time_durs=0): if self.with_segments: segment = self.segments[self.cur_item] - key = segment['segment_id'] + key = segment["segment_id"] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: key, file_path, _, _ = self.scp[self.cur_item] x_i, fs_i = self.read_wavspecifier( - file_path, self.wav_scale, offset_i, dur_i) + file_path, self.wav_scale, offset_i, dur_i + ) keys.append(key) data.append(x_i) @@ -293,48 +308,54 @@ def read(self, num_records=0, time_offset=0, time_durs=0): return keys, data, fs - @staticmethod def filter_args(**kwargs): - valid_args = ('part_idx', 'num_parts','wav_scale') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("part_idx", "num_parts", "wav_scale") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'wav-scale', default=2**15-1, type=float, - help=('multiplicative factor for waveform')) + p1 = "--" + prefix + "." + + parser.add_argument( + p1 + "wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) try: parser.add_argument( - p1+'part-idx', type=int, default=1, - help=('splits the list of files into num-parts and ' - 'processes part-idx')) + p1 + "part-idx", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) parser.add_argument( - p1+'num-parts', type=int, default=1, - help=('splits the list of files into num-parts and ' - 'processes part-idx')) + p1 + "num-parts", + type=int, + default=1, + help=( + "splits the list of files into num-parts and " "processes part-idx" + ), + ) except: pass add_argparse_args = add_class_args - -class RandomAccessAudioReader(AudioReader): - def __init__(self, file_path, segments_path=None, wav_scale=2**15-1): +class RandomAccessAudioReader(AudioReader): + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): super().__init__(file_path, segments_path, wav_scale) - - def _read(self, keys, time_offset=0, time_durs=0): """Reads the waveforms for the recordings in keys. 
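Because __iter__/__next__ above yield one (key, waveform, sampling_rate) tuple at a time, the sequential reader can be consumed with a plain for-loop; a usage sketch with a made-up scp path:

    from hyperion.io.audio_reader import SequentialAudioReader

    # part_idx/num_parts let several jobs split the wav list between them.
    reader = SequentialAudioReader("data/train/wav.scp", part_idx=1, num_parts=1)
    for key, x, fs in reader:
        print(key, x.shape, fs)   # float waveform (scaled by wav_scale) and rate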
- + Args: keys: List of recording/segment_ids names. @@ -349,34 +370,34 @@ def _read(self, keys, time_offset=0, time_durs=0): data = [] fs = [] - for i,key in enumerate(keys): + for i, key in enumerate(keys): offset_i = time_offset[i] if offset_is_list else time_offset dur_i = time_durs[i] if dur_is_list else time_durs if self.with_segments: if not (key in self.segments): - raise Exception('Key %s not found' % key) - + raise Exception("Key %s not found" % key) + segment = self.segments[key] x_i, fs_i = self._read_segment(segment, offset_i, dur_i) else: if not (key in self.scp): - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) file_path, _, _ = self.scp[key] x_i, fs_i = self.read_wavspecifier( - file_path, self.wav_scale, offset_i, dur_i) + file_path, self.wav_scale, offset_i, dur_i + ) data.append(x_i) fs.append(fs_i) return data, fs - def read(self, keys, time_offset=0, time_durs=0): """Reads the waveforms for the recordings in keys. - + Args: keys: List of recording/segment_ids names. @@ -385,8 +406,7 @@ def read(self, keys, time_offset=0, time_durs=0): fs: List of sampling freq. """ try: - x, fs = self._read(keys, time_offset=time_offset, - time_durs=time_durs) + x, fs = self._read(keys, time_offset=time_offset, time_durs=time_durs) except: if isinstance(keys, str): keys = [keys] @@ -402,41 +422,48 @@ def read(self, keys, time_offset=0, time_durs=0): # we try to read from # time-offset to the end of the file, and remove the extra frames later, # this solves the problem in most cases - logging.info(('error-1 reading at keys={} offset={} ' - 'retrying reading until end-of-file ...').format( - keys, time_offset)) + logging.info( + ( + "error-1 reading at keys={} offset={} " + "retrying reading until end-of-file ..." + ).format(keys, time_offset) + ) x, fs = self._read(keys, time_offset=time_offset) for i in range(len(x)): end_sample = int(time_durs[i] * fs[i]) x[i] = x[i][:end_sample] except: # try to read the full file - logging.info(('error-2 reading at key={}, ' - 'retrying reading full file ...').format(keys)) + logging.info( + ( + "error-2 reading at key={}, " "retrying reading full file ..." + ).format(keys) + ) x, fs = self._read(keys) for i in range(len(x)): start_sample = int(time_offset[i] * fs[i]) end_sample = start_sample + int(time_durs[i] * fs[i]) x[i] = x[i][start_sample:end_sample] - - return x, fs + return x, fs @staticmethod def filter_args(**kwargs): - valid_args = ('wav_scale',) - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("wav_scale",) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'wav-scale', default=2**15-1, type=float, - help=('multiplicative factor for waveform')) - + p1 = "--" + prefix + "." 
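read() above recovers from segments whose requested window overruns the file by re-reading a wider span and trimming afterwards. The two trim steps, written as standalone helpers for clarity (fs in Hz, times in seconds):

    def trim_to_duration(x, fs, time_dur):
        # After re-reading from time_offset to end-of-file:
        # keep only the first time_dur seconds that were requested.
        return x[: int(time_dur * fs)]

    def slice_full_read(x, fs, time_offset, time_dur):
        # After re-reading the whole file: cut out the requested window.
        start = int(time_offset * fs)
        return x[start : start + int(time_dur * fs)]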
+ + parser.add_argument( + p1 + "wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) add_argparse_args = add_class_args diff --git a/hyperion/io/audio_writer.py b/hyperion/io/audio_writer.py index 4a365b86..2fb9ce3c 100644 --- a/hyperion/io/audio_writer.py +++ b/hyperion/io/audio_writer.py @@ -14,33 +14,50 @@ from ..utils.kaldi_io_funcs import is_token from .audio_reader import valid_ext -subtype_to_npdtype = {'PCM_32': 'int32', 'ALAW': 'int16', - 'IMA_ADPCM': 'int16', 'FLOAT': 'float32', - 'PCM_16': 'int16', 'DOUBLE': 'float64', - 'MS_ADPCM': 'int16', 'ULAW': 'int16', - 'PCM_U8': 'uint8', 'PCM_S8': 'int8', 'VORBIS': 'float32', - 'GSM610': 'int16', 'G721_32': 'int16', 'PCM_24': 'int24'} +subtype_to_npdtype = { + "PCM_32": "int32", + "ALAW": "int16", + "IMA_ADPCM": "int16", + "FLOAT": "float32", + "PCM_16": "int16", + "DOUBLE": "float64", + "MS_ADPCM": "int16", + "ULAW": "int16", + "PCM_U8": "uint8", + "PCM_S8": "int8", + "VORBIS": "float32", + "GSM610": "int16", + "G721_32": "int16", + "PCM_24": "int24", +} + class AudioWriter(object): """Abstract base class to write audio files. - + Attributes: output_path: output data file path. script_path: optional output scp file. - audio_format: audio file format - audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], + audio_format: audio file format + audio_subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) scp_sep: Separator for scp files (default ' '). """ - def __init__(self, output_path, script_path=None, - audio_format='wav', audio_subtype=None, scp_sep=' '): + def __init__( + self, + output_path, + script_path=None, + audio_format="wav", + audio_subtype=None, + scp_sep=" ", + ): self.output_path = output_path self.script_path = script_path self.audio_format = audio_format self.scp_sep = scp_sep - assert '.' + self.audio_format in valid_ext + assert "." + self.audio_format in valid_ext if audio_subtype is None: self.subtype = sf.default_subtype(self.audio_format) else: @@ -54,44 +71,38 @@ def __init__(self, output_path, script_path=None, pass if script_path is not None: - self.f_script = open(script_path, 'w') + self.f_script = open(script_path, "w") else: self.f_script = None - def __enter__(self): """Function required when entering contructions of type - - with AudioWriter('./path') as f: - f.write(key, data) + + with AudioWriter('./path') as f: + f.write(key, data) """ return self - - def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type - - with AudioWriter('./path') as f: - f.write(key, data) + + with AudioWriter('./path') as f: + f.write(key, data) """ self.close() - - def close(self): """Closes the script file if open""" if self.f_script is not None: self.f_script.close() - def write(self, keys, data, fs): """Writes waveform to audio file. - + Args: key: List of recodings names. 
data: List of waveforms - fs: + fs: """ if isinstance(keys, str): keys = [keys] @@ -102,47 +113,60 @@ def write(self, keys, data, fs): dtype = subtype_to_npdtype[self.subtype] output_files = [] for i, key_i in enumerate(keys): - assert is_token(key_i), 'Token %s not valid' % key_i - file_basename = re.sub('/', '-', key_i) - output_file = '%s/%s.%s' % (self.output_path, file_basename, self.audio_format) + assert is_token(key_i), "Token %s not valid" % key_i + file_basename = re.sub("/", "-", key_i) + output_file = "%s/%s.%s" % ( + self.output_path, + file_basename, + self.audio_format, + ) fs_i = fs[i] if fs_is_list else fs data_i = data[i].astype(dtype, copy=False) sf.write(output_file, data_i, fs_i, subtype=self.subtype) - + output_files.append(output_file) if self.f_script is not None: - self.f_script.write('%s%s%s\n' % ( - key_i, self.scp_sep, output_file)) + self.f_script.write("%s%s%s\n" % (key_i, self.scp_sep, output_file)) self.f_script.flush() return output_files - @staticmethod def filter_args(**kwargs): - valid_args = ('output_fs','output_wav_scale', 'output_audio_format', 'output_audio_subtype') - return dict((re.sub('output_','', k), kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ( + "output_fs", + "output_wav_scale", + "output_audio_format", + "output_audio_subtype", + ) + return dict( + (re.sub("output_", "", k), kwargs[k]) for k in valid_args if k in kwargs + ) - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - + p1 = "--" + prefix + "." + # parser.add_argument(p1+'output-wav-scale', default=1, type=float, # help=('scale to divide the waveform before writing')) - parser.add_argument(p1+'output-audio-format', default='flac', - choices=['flac','ogg', 'wav'], - help=('ouput audio format')) - - parser.add_argument(p1+'output-audio-subtype', default=None, - choices=['pcm_16','pcm_24', 'float', 'double', 'vorbis'], - help=('coding format for audio file')) + parser.add_argument( + p1 + "output-audio-format", + default="flac", + choices=["flac", "ogg", "wav"], + help=("ouput audio format"), + ) + + parser.add_argument( + p1 + "output-audio-subtype", + default=None, + choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], + help=("coding format for audio file"), + ) # parser.add_argument(p1+'output-fs', default=16000, type=int, # help=('output sample frequency')) diff --git a/hyperion/io/bin_vad_reader.py b/hyperion/io/bin_vad_reader.py index 46471f2d..452eb106 100644 --- a/hyperion/io/bin_vad_reader.py +++ b/hyperion/io/bin_vad_reader.py @@ -11,10 +11,17 @@ from .vad_reader import VADReader from .data_rw_factory import RandomAccessDataReaderFactory as DRF -class BinVADReader(VADReader): - def __init__(self, rspecifier, path_prefix=None, scp_sep=' ', - frame_length=25, frame_shift=10, snip_edges=False): +class BinVADReader(VADReader): + def __init__( + self, + rspecifier, + path_prefix=None, + scp_sep=" ", + frame_length=25, + frame_shift=10, + snip_edges=False, + ): r = DRF.create(rspecifier, path_prefix, scp_sep=scp_sep) super().__init__(r.file_path, r.permissive) @@ -23,14 +30,20 @@ def __init__(self, rspecifier, path_prefix=None, scp_sep=' ', self.frame_length = frame_length self.snip_edges = snip_edges - def read_num_frames(self, keys): return self.r.read_dims(keys, assert_same_dim=False) - - def read(self, keys, squeeze=False, offset=0, num_frames=0, - frame_length=25, frame_shift=10, snip_edges=False, - signal_lengths=None): + def read( + self, + keys, + squeeze=False, + 
offset=0, + num_frames=0, + frame_length=25, + frame_shift=10, + snip_edges=False, + signal_lengths=None, + ): if isinstance(keys, str): keys = [keys] @@ -40,7 +53,8 @@ def read(self, keys, squeeze=False, offset=0, num_frames=0, assert snip_edges == self.snip_edges offset_is_list, num_frames_is_list = self._assert_offsets_num_frames( - keys, offset, num_frames) + keys, offset, num_frames + ) vad = self.r.read(keys) output_vad = [] @@ -50,13 +64,12 @@ def read(self, keys, squeeze=False, offset=0, num_frames=0, num_frames_i = num_frames[i] if num_frames_is_list else num_frames vad_i = self._get_bin_vad_slice(vad_i, offset_i, num_frames_i) output_vad.append(vad_i) - + if squeeze: output_vad = self.r._squeeeze(output_vad, self.permissive) return output_vad - def read_timestamps(self, keys, merge_tol=0.001): if isinstance(keys, str): keys = [keys] @@ -66,9 +79,12 @@ def read_timestamps(self, keys, merge_tol=0.001): for i in range(len(keys)): vad_i = vad[i].astype(np.bool, copy=False) ts_i = bin_vad_to_timestamps( - vad_i, self.frame_length/1000, self.frame_shift/1000, - self.snip_edges, merge_tol) + vad_i, + self.frame_length / 1000, + self.frame_shift / 1000, + self.snip_edges, + merge_tol, + ) ts.append(ts_i) return ts - diff --git a/hyperion/io/copy_feats.py b/hyperion/io/copy_feats.py index ab6067d8..519e3f5c 100644 --- a/hyperion/io/copy_feats.py +++ b/hyperion/io/copy_feats.py @@ -11,12 +11,21 @@ class CopyFeats(object): - """ Class to convet between Ark/hdf5 feature formats. - """ - - def __init__(self, input_spec, output_spec, path_prefix=None, - compress=False, compression_method='auto', write_num_frames=None, - scp_sep=' ', part_idx=1, num_parts=1, chunk_size=1): + """Class to convet between Ark/hdf5 feature formats.""" + + def __init__( + self, + input_spec, + output_spec, + path_prefix=None, + compress=False, + compression_method="auto", + write_num_frames=None, + scp_sep=" ", + part_idx=1, + num_parts=1, + chunk_size=1, + ): """CopyFeats constructor, it executes the conversion. Args: @@ -38,77 +47,81 @@ def __init__(self, input_spec, output_spec, path_prefix=None, h5,scp:file.h5,file.scp ark,scp:file.ark,file.scp - path_prefix: If input_spec is a scp file, it pre-appends - path_prefix string to the second column of - the scp file. This is useful when data - is read from a different directory of that + path_prefix: If input_spec is a scp file, it pre-appends + path_prefix string to the second column of + the scp file. This is useful when data + is read from a different directory of that it was created. compress: if True, it compress the features (default: False). compression_method: Kaldi compression method: - {auto (default), speech_feat, + {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. scp_sep: Separator for scp files (default ' '). - part_idx: It splits the input into num_parts and writes only + part_idx: It splits the input into num_parts and writes only part part_idx, where part_idx=1,...,num_parts. num_parts: Number of parts to split the input data. - chunk_size: When copying, it reads the input files in groups of + chunk_size: When copying, it reads the input files in groups of chunk_size (default:1). 
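read_timestamps() above turns a boolean frame-level VAD into speech segments in seconds using frame_shift/1000. A simplified stand-in for bin_vad_to_timestamps that ignores snip_edges and the merge tolerance:

    import numpy as np

    def vad_to_timestamps(vad, frame_shift=0.010):
        # vad: one boolean per frame; returns (t_beg, t_end) pairs in seconds.
        segments, start = [], None
        for i, active in enumerate(vad):
            if active and start is None:
                start = i
            elif not active and start is not None:
                segments.append((start * frame_shift, i * frame_shift))
                start = None
        if start is not None:
            segments.append((start * frame_shift, len(vad) * frame_shift))
        return np.array(segments)

    print(vad_to_timestamps(np.array([0, 1, 1, 1, 0, 0, 1, 1], dtype=bool)))
    # [[0.01 0.04]
    #  [0.06 0.08]]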
""" if isinstance(input_spec, str): input_spec = [input_spec] - assert not(num_parts>1 and len(input_spec)>1), ( - 'Merging and splitting at the same time is not supported') + assert not ( + num_parts > 1 and len(input_spec) > 1 + ), "Merging and splitting at the same time is not supported" if write_num_frames is not None: - f_nf = open(write_num_frames, 'w') + f_nf = open(write_num_frames, "w") + + logging.info("opening output stream: %s" % (output_spec)) + with DWF.create( + output_spec, + compress=compress, + compression_method=compression_method, + scp_sep=scp_sep, + ) as writer: - logging.info('opening output stream: %s' % (output_spec)) - with DWF.create(output_spec, - compress=compress, compression_method=compression_method, - scp_sep=scp_sep) as writer: - for rspec in input_spec: - logging.info('opening input stream: %s' % (rspec)) - with DRF.create(rspec, path_prefix=path_prefix, scp_sep=scp_sep, - part_idx=part_idx, num_parts=num_parts) as reader: + logging.info("opening input stream: %s" % (rspec)) + with DRF.create( + rspec, + path_prefix=path_prefix, + scp_sep=scp_sep, + part_idx=part_idx, + num_parts=num_parts, + ) as reader: while not reader.eof(): key, data = reader.read(chunk_size) if len(key) == 0: break - logging.info('copying %d feat matrices' % (len(key))) + logging.info("copying %d feat matrices" % (len(key))) writer.write(key, data) if write_num_frames is not None: - for k,v in zip(key, data): - f_nf.write('%s %d\n' % (k, v.shape[0])) - + for k, v in zip(key, data): + f_nf.write("%s %d\n" % (k, v.shape[0])) + if write_num_frames is not None: f_nf.close() - - @staticmethod def filter_args(**kwargs): """Extracts the relevant arguments for the CopyFeats object. - + Args: kwargs: Dictionary containing arguments for several classes. - + Returns: Dictionary with the relevant arguments to initialize the object. """ - valid_args = ('scp_sep', 'path_prefix', 'part_idx', 'num_parts') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("scp_sep", "path_prefix", "part_idx", "num_parts") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod def add_class_args(parser, prefix=None): """Adds arguments required to initialize the object to python argparse object. - + Args: parser: Python argparse object. prefix: Prefix for the argument names. The prefix is useful when you have @@ -116,28 +129,32 @@ def add_class_args(parser, prefix=None): initialize each of them with different arguments. """ if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - + p1 = "--" + prefix + "." 
+ parser.add_argument( - p1+'scp-sep', default=' ', - help=('scp file field separator')) + p1 + "scp-sep", default=" ", help=("scp file field separator") + ) parser.add_argument( - p1+'path-prefix', default=None, - help=('scp file_path prefix')) + p1 + "path-prefix", default=None, help=("scp file_path prefix") + ) parser.add_argument( - p1+'part-idx', type=int, default=1, - help=('splits the list of files in num-parts and process part_idx')) + p1 + "part-idx", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) parser.add_argument( - p1+'num-parts', type=int, default=1, - help=('splits the list of files in num-parts and process part_idx')) + p1 + "num-parts", + type=int, + default=1, + help=("splits the list of files in num-parts and process part_idx"), + ) + parser.add_argument(p1 + "compress", default=False, action="store_true") parser.add_argument( - p1+'compress', default=False, action='store_true') - parser.add_argument( - p1+'compression-method', default='auto', - choices=compression_methods) - + p1 + "compression-method", default="auto", choices=compression_methods + ) add_argparse_args = add_class_args diff --git a/hyperion/io/data_reader.py b/hyperion/io/data_reader.py index c264e278..f0c61d3a 100644 --- a/hyperion/io/data_reader.py +++ b/hyperion/io/data_reader.py @@ -15,16 +15,17 @@ class DataReader(object): __metaclass__ = ABCMeta + def __init__(self, file_path, transform=None, permissive=False): """Abstract base class to read Ark or hdf5 feature files. - + Attributes: file_path: h5, ark or scp file to read. - transform: TransformList object, applies a transformation to the + transform: TransformList object, applies a transformation to the features after reading them from disk. - permissive: If True, if the data that we want to read is not in the file + permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. - + """ self.file_path = file_path self.permissive = permissive @@ -32,36 +33,28 @@ def __init__(self, file_path, transform=None, permissive=False): self.transform = TransformList.load(transform) else: self.transform = transform - - def __enter__(self): """Function required when entering contructions of type - with DataReader('file.h5') as f: - keys, data = f.read() + with DataReader('file.h5') as f: + keys, data = f.read() """ return self - - def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type - with DataReader('file.h5') as f: - keys, data = f.read() + with DataReader('file.h5') as f: + keys, data = f.read() """ self.close() - - @abstractmethod def close(self): """Closes input file.""" pass - - @staticmethod def _squeeze(data, permissive=False): """Converts list of matrices to 3D numpy array or @@ -81,23 +74,21 @@ def _squeeze(data, permissive=False): for i in range(len(data)): if len(data[i]) == 0: if permissive: - data[i] = np.zeros((1,)+shape, dtype=float_cpu()) + data[i] = np.zeros((1,) + shape, dtype=float_cpu()) continue assert ndim == data[i].ndim assert shape[-1] == data[i].shape[-1] data[i] = np.expand_dims(data[i], axis=0) - + return np.concatenate(tuple(data), axis=0) - - - + @staticmethod def _combine_ranges(read_range, row_offset, num_rows): """Combines two frame ranges. One is the range in the scp file, e.g, in the scp file recording1 file1.ark:34[3:40] recording2 file1.ark:100[5:20] - + [3:40] and [5:20] are frame ranges. 
The user can decide to just read a submatrix of that, e.g., @@ -106,7 +97,7 @@ def _combine_ranges(read_range, row_offset, num_rows): row_offset=4 (3+1) and num_rows=10. Args: - read_range: Frame range from scp file. It is a tuple with the + read_range: Frame range from scp file. It is a tuple with the first row and number of rows to read. row_offset: User defined row_offset. num_rows: User defined number of rows to read, it it is 0, we read @@ -128,8 +119,6 @@ def _combine_ranges(read_range, row_offset, num_rows): row_offset = row_offset + read_range[0] return row_offset, num_rows - - @staticmethod def _apply_range_to_shape(shape, row_offset, num_rows): """Modifies shape given the user defined row_offset and num_rows to read. @@ -155,36 +144,39 @@ def _apply_range_to_shape(shape, row_offset, num_rows): return shape - - class SequentialDataReader(DataReader): """Abstract base class to read Ark or hdf5 feature files in - sequential order. - - Attributes: - file_path: h5, ark or scp file to read. - transform: TransformList object, applies a transformation to the - features after reading them from disk. - permissive: If True, if the data that we want to read is not in the file - it returns an empty matrix, if False it raises an exception. - part_idx: It splits the input into num_parts and writes only - part part_idx, where part_idx=1,...,num_parts. - num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. + sequential order. + + Attributes: + file_path: h5, ark or scp file to read. + transform: TransformList object, applies a transformation to the + features after reading them from disk. + permissive: If True, if the data that we want to read is not in the file + it returns an empty matrix, if False it raises an exception. + part_idx: It splits the input into num_parts and writes only + part part_idx, where part_idx=1,...,num_parts. + num_parts: Number of parts to split the input data. + split_by_key: If True, all the elements with the same key go to the same part. """ - + __metaclass__ = ABCMeta - def __init__(self, file_path, transform=None, permissive=False, - part_idx=1, num_parts=1, split_by_key=False): + def __init__( + self, + file_path, + transform=None, + permissive=False, + part_idx=1, + num_parts=1, + split_by_key=False, + ): super().__init__(file_path, transform, permissive) self.lock = multiprocessing.Lock() self.part_idx = part_idx self.num_parts = num_parts self.split_by_key = split_by_key - - def __iter__(self): """Needed to build an iterator, e.g.: r = SequentialDataReader(...) @@ -193,8 +185,6 @@ def __iter__(self): """ return self - - def __next__(self): """Needed to build an iterator, e.g.: r = SequentialDataReader(...) @@ -202,27 +192,21 @@ def __next__(self): print(key, data) """ key, data = self.read(1) - if len(key)==0: + if len(key) == 0: raise StopIteration return key[0], data[0] - - def next(self): """__next__ for Python 2""" return self.__next__() - - @abstractmethod def reset(self): - """Returns the file pointer to the begining of the dataset, - then we can start reading the features again. + """Returns the file pointer to the begining of the dataset, + then we can start reading the features again. """ pass - - @abstractmethod def eof(self): """End of file. @@ -231,15 +215,13 @@ def eof(self): True, when we have read all the recordings in the dataset. 
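A short worked example of the range composition described above, using the documented convention that an scp range is (first_row, num_rows): with file1.ark:34[3:40] and a user request of row_offset=1, the effective start row is 3 + 1 = 4. Only the offset composition is shown; clipping num_rows against the scp range is omitted:

    def combine_offsets(scp_first_row, user_row_offset):
        # The scp entry already starts at scp_first_row;
        # the user offset is applied on top of it.
        return scp_first_row + user_row_offset

    print(combine_offsets(3, 1))  # 4, matching the docstring example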
""" return False - - @abstractmethod def read_num_rows(self, num_records=0, assert_same_dim=True): """Reads the number of rows in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -250,14 +232,12 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): """ pass - - @abstractmethod def read_dims(self, num_records=0, assert_same_dim=True): """Reads the number of columns in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -268,14 +248,12 @@ def read_dims(self, num_records=0, assert_same_dim=True): """ pass - - @abstractmethod def read_shapes(self, num_records=0, assert_same_dim=True): """Reads the shapes in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -286,20 +264,18 @@ def read_shapes(self, num_records=0, assert_same_dim=True): """ pass - - @abstractmethod def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): """Reads next num_records feature matrices/vectors. - + Args: num_records: Number of feature matrices to read. - squeeze: If True, it converts the list of + squeeze: If True, it converts the list of matrices/vectors to 3D/2D numpy array. All matrices need to have same number of rows. - offset: List of integers or numpy array of with the first row to + offset: List of integers or numpy array of with the first row to read from each feature matrix. - num_rows: List of integers or numpy array of with the + num_rows: List of integers or numpy array of with the number of rows to read from each feature matrix. If 0 it reads all the rows. @@ -310,33 +286,29 @@ def read(self, num_records=0, squeeze=False, offset=0, num_rows=0): pass - - class RandomAccessDataReader(DataReader): __metaclass__ = ABCMeta def __init__(self, file_path, transform=None, permissive=False): """Abstract base class to read Ark or hdf5 feature files in random order. - + Attributes: file_path: h5 or scp file to read. - transform: TransformList object, applies a transformation to the + transform: TransformList object, applies a transformation to the features after reading them from disk. - permissive: If True, if the data that we want to read is not in the file + permissive: If True, if the data that we want to read is not in the file it returns an empty matrix, if False it raises an exception. """ - - super().__init__(file_path, transform, permissive) + super().__init__(file_path, transform, permissive) - @abstractmethod def read_num_rows(self, keys=None, assert_same_dim=True): """Reads the number of rows in the feature matrices of the dataset. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the number of rows. 
assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -346,14 +318,12 @@ def read_num_rows(self, keys=None, assert_same_dim=True): """ pass - - @abstractmethod def read_dims(self, keys=None, assert_same_dim=True): """Reads the number of columns in the feature matrices of the dataset. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the number of columns. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -363,14 +333,12 @@ def read_dims(self, keys=None, assert_same_dim=True): """ pass - - @abstractmethod def read_shapes(self, keys=None, assert_same_dim=True): """Reads the shapes in the feature matrices of the dataset. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the shapes. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -380,21 +348,19 @@ def read_shapes(self, keys=None, assert_same_dim=True): """ pass - - @abstractmethod def read(self, keys, squeeze=False, offset=0, num_rows=0): """Reads the feature matrices/vectors for the recordings in keys. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the feature matrices/vectors. - squeeze: If True, it converts the list of + squeeze: If True, it converts the list of matrices/vectors to 3D/2D numpy array. All matrices need to have same number of rows. - offset: List of integers or numpy array of with the first row to + offset: List of integers or numpy array of with the first row to read from each feature matrix. - num_rows: List of integers or numpy array of with the + num_rows: List of integers or numpy array of with the number of rows to read from each feature matrix. If 0 it reads all the rows. diff --git a/hyperion/io/data_rw_factory.py b/hyperion/io/data_rw_factory.py index 9c47e241..ed408156 100644 --- a/hyperion/io/data_rw_factory.py +++ b/hyperion/io/data_rw_factory.py @@ -20,187 +20,181 @@ from .h5_data_reader import RandomAccessH5ScriptDataReader as RH5SDR - class DataWriterFactory(object): """ Class to create object that write data to hdf5/ark files. 
""" @staticmethod - def create(wspecifier, compress=False, compression_method='auto', scp_sep=' '): + def create(wspecifier, compress=False, compression_method="auto", scp_sep=" "): if isinstance(wspecifier, str): wspecifier = WSpecifier.create(wspecifier) - if (wspecifier.spec_type == WSpecType.ARCHIVE or - wspecifier.spec_type == WSpecType.BOTH): - + if ( + wspecifier.spec_type == WSpecType.ARCHIVE + or wspecifier.spec_type == WSpecType.BOTH + ): + if wspecifier.archive_type == ArchiveType.H5: - return H5DW(wspecifier.archive, wspecifier.script, - flush=wspecifier.flush, - compress=compress, - compression_method=compression_method, - scp_sep=scp_sep) + return H5DW( + wspecifier.archive, + wspecifier.script, + flush=wspecifier.flush, + compress=compress, + compression_method=compression_method, + scp_sep=scp_sep, + ) else: - return ADW(wspecifier.archive, wspecifier.script, - binary=wspecifier.binary, flush=wspecifier.flush, - compress=compress, - compression_method=compression_method, - scp_sep=scp_sep) - + return ADW( + wspecifier.archive, + wspecifier.script, + binary=wspecifier.binary, + flush=wspecifier.flush, + compress=compress, + compression_method=compression_method, + scp_sep=scp_sep, + ) @staticmethod def filter_args(**kwargs): - valid_args = ('scp_sep', 'compress', 'compression_method') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("scp_sep", "compress", "compression_method") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") + parser.add_argument("--scp-sep", default=" ", help=("scp file field separator")) + parser.add_argument("--compress", default=False, action="store_true") parser.add_argument( - '--scp-sep', default=' ', - help=('scp file field separator')) - parser.add_argument('--compress', default=False, action='store_true') - parser.add_argument('--compression-method', default='auto', - choices=compression_methods) + "--compression-method", default="auto", choices=compression_methods + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='data writer options') + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='data writer options') - - -class SequentialDataReaderFactory(object): +class SequentialDataReaderFactory(object): @staticmethod - def create(rspecifier, path_prefix=None, scp_sep=' ', **kwargs): - + def create(rspecifier, path_prefix=None, scp_sep=" ", **kwargs): + if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) - - if rspecifier.spec_type == RSpecType.ARCHIVE: + + if rspecifier.spec_type == RSpecType.ARCHIVE: if rspecifier.archive_type == ArchiveType.H5: return SH5FDR(rspecifier.archive, **kwargs) else: return SAFDR(rspecifier.archive, **kwargs) else: if rspecifier.archive_type == ArchiveType.H5: - return SH5SDR(rspecifier.script, path_prefix, - scp_sep=scp_sep, **kwargs) + return SH5SDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) else: - return SASDR(rspecifier.script, path_prefix, - scp_sep=scp_sep, **kwargs) - - + return SASDR(rspecifier.script, path_prefix, scp_sep=scp_sep, **kwargs) @staticmethod def filter_args(**kwargs): - valid_args = ('scp_sep', 'path_prefix', 'part_idx', 'num_parts') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("scp_sep", 
"path_prefix", "part_idx", "num_parts") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') - + parser = ArgumentParser(prog="") + try: parser.add_argument( - '--scp-sep', default=' ', - help=('scp file field separator')) + "--scp-sep", default=" ", help=("scp file field separator") + ) except: pass parser.add_argument( - '--path-prefix', default=None, - help=('scp file_path prefix')) + "--path-prefix", default=None, help=("scp file_path prefix") + ) try: parser.add_argument( - '--part-idx', type=int, default=1, - help=('splits the list of files in num-parts ' - 'and process part_idx')) + "--part-idx", + type=int, + default=1, + help=("splits the list of files in num-parts " "and process part_idx"), + ) parser.add_argument( - '--num-parts', type=int, default=1, - help=('splits the list of files in num-parts ' - 'and process part_idx')) + "--num-parts", + type=int, + default=1, + help=("splits the list of files in num-parts " "and process part_idx"), + ) except: pass if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='data reader options') + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='data reader options') - class RandomAccessDataReaderFactory(object): - @staticmethod - def create(rspecifier, path_prefix=None, transform=None, scp_sep=' '): + def create(rspecifier, path_prefix=None, transform=None, scp_sep=" "): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) logging.debug(rspecifier.__dict__) - if rspecifier.spec_type == RSpecType.ARCHIVE: + if rspecifier.spec_type == RSpecType.ARCHIVE: if rspecifier.archive_type == ArchiveType.H5: - return RH5FDR(rspecifier.archive, - transform=transform, - permissive=rspecifier.permissive) + return RH5FDR( + rspecifier.archive, + transform=transform, + permissive=rspecifier.permissive, + ) else: raise ValueError( - 'Random access to Ark file %s needs a script file' % - rspecifier.archive) + "Random access to Ark file %s needs a script file" + % rspecifier.archive + ) else: if rspecifier.archive_type == ArchiveType.H5: - return RH5SDR(rspecifier.archive, path_prefix, - transform=transform, - permissive=rspecifier.permissive, - scp_sep=scp_sep) + return RH5SDR( + rspecifier.archive, + path_prefix, + transform=transform, + permissive=rspecifier.permissive, + scp_sep=scp_sep, + ) else: - return RADR(rspecifier.script, path_prefix, - transform=transform, - permissive=rspecifier.permissive, - scp_sep=scp_sep) - + return RADR( + rspecifier.script, + path_prefix, + transform=transform, + permissive=rspecifier.permissive, + scp_sep=scp_sep, + ) @staticmethod def filter_args(**kwargs): - valid_args = ('scp_sep', 'path_prefix') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("scp_sep", "path_prefix") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') - + parser = ArgumentParser(prog="") + try: parser.add_argument( - '--scp-sep', default=' ', - help=('scp file field separator')) + "--scp-sep", default=" ", help=("scp file field separator") + ) except: pass parser.add_argument( - '--path-prefix', default=None, - help=('scp file_path prefix')) + "--path-prefix", default=None, help=("scp file_path 
prefix") + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='data reader options') - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='data reader options') add_argparse_args = add_class_args diff --git a/hyperion/io/data_writer.py b/hyperion/io/data_writer.py index cd427542..cf2bb4f9 100644 --- a/hyperion/io/data_writer.py +++ b/hyperion/io/data_writer.py @@ -9,29 +9,37 @@ class DataWriter(object): """Abstract base class to write Ark or hdf5 feature files. - + Attributes: archive_path: output data file path. script_path: optional output scp file. flush: If True, it flushes the output after writing each feature matrix. compress: It True, it uses Kaldi compression. compression_method: Kaldi compression method: - {auto (default), speech_feat, + {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. scp_sep: Separator for scp files (default ' '). """ + __metaclass__ = ABCMeta - def __init__(self, archive_path, script_path=None, - flush=False, compress=False, compression_method='auto', scp_sep=' '): + def __init__( + self, + archive_path, + script_path=None, + flush=False, + compress=False, + compression_method="auto", + scp_sep=" ", + ): self.archive_path = archive_path self.script_path = script_path self._flush = flush self.compress = compress self.compression_method = compression_method self.scp_sep = scp_sep - + archive_dir = os.path.dirname(archive_path) if not os.path.exists(archive_dir): os.makedirs(archive_dir) @@ -40,52 +48,42 @@ def __init__(self, archive_path, script_path=None, script_dir = os.path.dirname(script_path) if not os.path.exists(script_dir): os.makedirs(script_dir) - - def __enter__(self): """Function required when entering contructions of type - - with DataWriter('file.h5') as f: - f.write(key, data) + + with DataWriter('file.h5') as f: + f.write(key, data) """ return self - - @abstractmethod def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type - - with DataWriter('file.h5') as f: - f.write(key, data) + + with DataWriter('file.h5') as f: + f.write(key, data) """ pass - - @abstractmethod def close(self): """Closes the output file""" pass - - @abstractmethod def flush(self): """Flushes the file""" pass - - @abstractmethod def write(self, key, data): """Writes data to file. - + Args: key: List of recodings names. - data: List of Feature matrices or vectors. - If all the matrices have the same dimension + data: List of Feature matrices or vectors. + If all the matrices have the same dimension it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. """ diff --git a/hyperion/io/h5_data_reader.py b/hyperion/io/h5_data_reader.py index fc8f4eec..7ade2549 100644 --- a/hyperion/io/h5_data_reader.py +++ b/hyperion/io/h5_data_reader.py @@ -19,7 +19,6 @@ from .data_reader import SequentialDataReader, RandomAccessDataReader - def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None): """Auxiliary function to read the feature matrix from hdf5 dataset. It decompresses the data if it was compressed. @@ -29,7 +28,7 @@ def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None): row_offset: First row to read from each feature matrix. num_rows: Number of rows to read from the feature matrix. If 0 it reads all the rows. 
- transform: TransformList object, applies a transformation to the + transform: TransformList object, applies a transformation to the features after reading them from disk. Returns: @@ -39,17 +38,18 @@ def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None): if num_rows == 0: data = dset[row_offset:] else: - data = dset[row_offset:row_offset+num_rows] + data = dset[row_offset : row_offset + num_rows] elif num_rows > 0: data = dset[:num_rows] else: data = dset - if 'data_format' in dset.attrs: + if "data_format" in dset.attrs: if not isinstance(data, np.ndarray): data = np.asarray(data) data = KaldiCompressedMatrix.build_from_data_attrs( - data, dset.attrs).to_ndarray() + data, dset.attrs + ).to_ndarray() assert num_rows == 0 or data.shape[0] == num_rows @@ -57,23 +57,20 @@ def _read_h5_data(dset, row_offset=0, num_rows=0, transform=None): if transform is not None: data = transform.predict(data) return data - - - class SequentialH5DataReader(SequentialDataReader): """Abstract base class to read hdf5 feature files in - sequential order. - - Attributes: - file_path: ark or scp file to read. - transform: TransformList object, applies a transformation to the - features after reading them from disk. - part_idx: It splits the input into num_parts and writes only - part part_idx, where part_idx=1,...,num_parts. - num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. + sequential order. + + Attributes: + file_path: ark or scp file to read. + transform: TransformList object, applies a transformation to the + features after reading them from disk. + part_idx: It splits the input into num_parts and writes only + part part_idx, where part_idx=1,...,num_parts. + num_parts: Number of parts to split the input data. + split_by_key: If True, all the elements with the same key go to the same part. """ def __init__(self, file_path, **kwargs): @@ -82,33 +79,27 @@ def __init__(self, file_path, **kwargs): self.cur_file = None self.cur_item = 0 - - def close(self): """Closes current hdf5 file.""" if self.f is not None: self.f.close() self.f = None - - def _open_archive(self, file_path): """Opens the hdf5 file where the next matrix/vector is - if it is not open. - If there was another hdf5 file open, it closes it. + if it is not open. + If there was another hdf5 file open, it closes it. """ if self.f is None or file_path != self.cur_file: self.close() self.cur_file = file_path - self.f = h5py.File(file_path, 'r') - + self.f = h5py.File(file_path, "r") - def read_num_rows(self, num_records=0, assert_same_dim=True): """Reads the number of rows in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -118,16 +109,14 @@ def read_num_rows(self, num_records=0, assert_same_dim=True): Integer numpy array with num_records number of rows. """ keys, shapes = self.read_shapes(num_records, assert_same_dim) - num_rows = np.array([s[0] if len(s)==2 else 1 for s in shapes], dtype=int) + num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return keys, num_rows - - def read_dims(self, num_records=0, assert_same_dim=True): """Reads the number of columns in the feature matrices of the dataset. 
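_read_h5_data above slices only the requested rows out of the hdf5 dataset, so partial reads never pull the whole matrix into memory, and compressed matrices are detected through the 'data_format' attribute. A minimal h5py sketch of the uncompressed path (file and dataset names are made up):

    import h5py
    import numpy as np

    def read_rows(h5_path, key, row_offset=0, num_rows=0):
        with h5py.File(h5_path, "r") as f:
            dset = f[key]
            if num_rows == 0:
                return dset[row_offset:]                      # rest of the matrix
            return dset[row_offset : row_offset + num_rows]   # requested rows only

    with h5py.File("toy.h5", "w") as f:
        f.create_dataset("utt1", data=np.random.randn(100, 40).astype(np.float32))
    print(read_rows("toy.h5", "utt1", row_offset=10, num_rows=10).shape)  # (10, 40)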
- + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -138,64 +127,55 @@ def read_dims(self, num_records=0, assert_same_dim=True): """ keys, shapes = self.read_shapes(num_records, False) dims = np.array([s[-1] for s in shapes], dtype=np.int32) - if assert_same_dim and len(dims)>0: - assert np.all(dims==dims[0]) + if assert_same_dim and len(dims) > 0: + assert np.all(dims == dims[0]) return keys, dims - - class SequentialH5FileDataReader(SequentialH5DataReader): """Class to read feature matrices/vectors in - sequential order from a single hdf5 file. - - Attributes: - file_path: Ark file to read. - transform: TransformList object, applies a transformation to the - features after reading them from disk. - part_idx: It splits the input into num_parts and writes only - part part_idx, where part_idx=1,...,num_parts. - num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. + sequential order from a single hdf5 file. + + Attributes: + file_path: Ark file to read. + transform: TransformList object, applies a transformation to the + features after reading them from disk. + part_idx: It splits the input into num_parts and writes only + part part_idx, where part_idx=1,...,num_parts. + num_parts: Number of parts to split the input data. + split_by_key: If True, all the elements with the same key go to the same part. """ def __init__(self, file_path, **kwargs): - super().__init__( - file_path, permissive=False, **kwargs) + super().__init__(file_path, permissive=False, **kwargs) self._open_archive(self.file_path) self._keys = list(self.f.keys()) if self.num_parts > 1: if self.split_by_key: self._keys, _ = split_list_group_by_key( - self._keys, self.part_idx, self.num_parts) + self._keys, self.part_idx, self.num_parts + ) else: self._keys, _ = split_list(self._keys, self.part_idx, self.num_parts) - @property def keys(self): return self._keys - - def reset(self): """Puts the file pointer back to the begining of the file""" if self.f is not None: self.cur_item = 0 - - def eof(self): """Returns True when it reaches the end of the ark file.""" return self.cur_item == len(self._keys) - - def read_shapes(self, num_records=0, assert_same_dim=True): """Reads the shapes in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -216,26 +196,24 @@ def read_shapes(self, num_records=0, assert_same_dim=True): keys.append(key) shapes.append(self.f[key].shape) self.cur_item += 1 - - if assert_same_dim and len(shapes)>0: + + if assert_same_dim and len(shapes) > 0: dims = np.array([s[-1] for s in shapes], dtype=np.int32) assert np.all(dims == dims[0]) - - return keys, shapes - + return keys, shapes def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): """Reads next num_records feature matrices/vectors. - + Args: num_records: Number of feature matrices to read. - squeeze: If True, it converts the list of + squeeze: If True, it converts the list of matrices/vectors to 3D/2D numpy array. 
All matrices need to have same number of rows. - offset: List of integers or numpy array of with the first row to + offset: List of integers or numpy array of with the first row to read from each feature matrix. - num_rows: List of integers or numpy array of with the + num_rows: List of integers or numpy array of with the number of rows to read from each feature matrix. If 0 it reads all the rows. @@ -246,10 +224,12 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self._keys) - self.cur_item - row_offset_is_list = (isinstance(row_offset, list) or - isinstance(row_offset, np.ndarray)) - num_rows_is_list = (isinstance(num_rows, list) or - isinstance(num_rows, np.ndarray)) + row_offset_is_list = isinstance(row_offset, list) or isinstance( + row_offset, np.ndarray + ) + num_rows_is_list = isinstance(num_rows, list) or isinstance( + num_rows, np.ndarray + ) keys = [] data = [] with self.lock: @@ -257,12 +237,11 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if self.eof(): break - key_i = self._keys[self.cur_item] - + row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows - + dset_i = self.f[key_i] data_i = _read_h5_data(dset_i, row_offset_i, num_rows_i, self.transform) @@ -271,51 +250,46 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): keys.append(key_i) data.append(data_i) - if squeeze: data = self._squeeze(data) - - return keys, data - + return keys, data class SequentialH5ScriptDataReader(SequentialH5DataReader): """Class to read features from multiple hdf5 files where a scp file - indicates which hdf5 file contains each feature matrix. - - Attributes: - file_path: scp file to read. - path_prefix: If input_spec is a scp file, it pre-appends - path_prefix string to the second column of - the scp file. This is useful when data - is read from a different directory of that - it was created. - scp_sep: Separator for scp files (default ' '). - transform: TransformList object, applies a transformation to the - features after reading them from disk. - part_idx: It splits the input into num_parts and writes only - part part_idx, where part_idx=1,...,num_parts. - num_parts: Number of parts to split the input data. - split_by_key: If True, all the elements with the same key go to the same part. + indicates which hdf5 file contains each feature matrix. + + Attributes: + file_path: scp file to read. + path_prefix: If input_spec is a scp file, it pre-appends + path_prefix string to the second column of + the scp file. This is useful when data + is read from a different directory of that + it was created. + scp_sep: Separator for scp files (default ' '). + transform: TransformList object, applies a transformation to the + features after reading them from disk. + part_idx: It splits the input into num_parts and writes only + part part_idx, where part_idx=1,...,num_parts. + num_parts: Number of parts to split the input data. + split_by_key: If True, all the elements with the same key go to the same part. 
""" - def __init__(self, file_path, path_prefix=None, scp_sep=' ', **kwargs): - super().__init__( - file_path, permissive=False, **kwargs) - + def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + super().__init__(file_path, permissive=False, **kwargs) + self.scp = SCPList.load(self.file_path, sep=scp_sep) if self.num_parts > 1: - self.scp = self.scp.split(self.part_idx, self.num_parts, - group_by_key=self.split_by_key) + self.scp = self.scp.split( + self.part_idx, self.num_parts, group_by_key=self.split_by_key + ) if path_prefix is not None: self.scp.add_prefix_to_filepath(path_prefix) - @property def keys(self): return self.scp.key - def reset(self): """Closes all the open hdf5 files and puts the read pointer pointing @@ -323,19 +297,15 @@ def reset(self): self.close() self.cur_item = 0 - - def eof(self): """Returns True when all the elements in the scp have been read.""" return self.cur_item == len(self.scp) - - def read_shapes(self, num_records=0, assert_same_dim=True): """Reads the shapes in the feature matrices of the dataset. - + Args: - num_records: How many matrices shapes to read, if num_records=0 it + num_records: How many matrices shapes to read, if num_records=0 it reads al the matrices in the dataset. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -346,7 +316,7 @@ def read_shapes(self, num_records=0, assert_same_dim=True): """ if num_records == 0: num_records = len(self.scp) - self.cur_item - + keys = [] shapes = [] for i in range(num_records): @@ -360,33 +330,29 @@ def read_shapes(self, num_records=0, assert_same_dim=True): self._open_archive(file_path) shape_i = self.f[key].shape - shape_i = self._apply_range_to_shape( - shape_i, row_offset_i, num_rows_i) - + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + keys.append(key) shapes.append(shape_i) self.cur_item += 1 - + if assert_same_dim: dims = np.array([s[-1] for s in shapes], dtype=np.int32) assert np.all(dims == dims[0]) - - return keys, shapes - + return keys, shapes - def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): """Reads next num_records feature matrices/vectors. - + Args: num_records: Number of feature matrices to read. - squeeze: If True, it converts the list of + squeeze: If True, it converts the list of matrices/vectors to 3D/2D numpy array. All matrices need to have same number of rows. - offset: List of integers or numpy array of with the first row to + offset: List of integers or numpy array of with the first row to read from each feature matrix. - num_rows: List of integers or numpy array of with the + num_rows: List of integers or numpy array of with the number of rows to read from each feature matrix. If 0 it reads all the rows. 
@@ -397,10 +363,12 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): if num_records == 0: num_records = len(self.scp) - self.cur_item - row_offset_is_list = (isinstance(row_offset, list) or - isinstance(row_offset, np.ndarray)) - num_rows_is_list = (isinstance(num_rows, list) or - isinstance(num_rows, np.ndarray)) + row_offset_is_list = isinstance(row_offset, list) or isinstance( + row_offset, np.ndarray + ) + num_rows_is_list = isinstance(num_rows, list) or isinstance( + num_rows, np.ndarray + ) keys = [] data = [] @@ -414,49 +382,45 @@ def read(self, num_records=0, squeeze=False, row_offset=0, num_rows=0): row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows row_offset_i, num_rows_i = self._combine_ranges( - range_spec, row_offset_i, num_rows_i) - + range_spec, row_offset_i, num_rows_i + ) + self._open_archive(file_path) - + dset_i = self.f[key] data_i = _read_h5_data(dset_i, row_offset_i, num_rows_i, self.transform) - self.cur_item += 1 + self.cur_item += 1 key = keys.append(key) data.append(data_i) - if squeeze: data = self._squeeze(data) - - return keys, data - + return keys, data class RandomAccessH5DataReader(RandomAccessDataReader): """Abstract base class to read hdf5 feature files in - random order. - - Attributes: - file_path: hdf5 or scp file to read. - transform: TransformList object, applies a transformation to the - features after reading them from disk. - permissive: If True, if the data that we want to read is not in the file - it returns an empty matrix, if False it raises an exception. + random order. + + Attributes: + file_path: hdf5 or scp file to read. + transform: TransformList object, applies a transformation to the + features after reading them from disk. + permissive: If True, if the data that we want to read is not in the file + it returns an empty matrix, if False it raises an exception. """ - def __init__(self, file_path, transform=None, permissive = False): + def __init__(self, file_path, transform=None, permissive=False): super().__init__(file_path, transform, permissive) self.f = None - - def read_num_rows(self, keys, assert_same_dim=True): """Reads the number of rows in the feature matrices of the dataset. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the number of rows. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -465,17 +429,14 @@ def read_num_rows(self, keys, assert_same_dim=True): Integer numpy array with the number of rows for the recordings in keys. """ shapes = self.read_shapes(keys, assert_same_dim) - num_rows = np.array([s[0] if len(s)==2 else 1 for s in shapes], - dtype=int) + num_rows = np.array([s[0] if len(s) == 2 else 1 for s in shapes], dtype=int) return num_rows - - def read_dims(self, keys, assert_same_dim=True): """Reads the number of columns in the feature matrices of the dataset. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the number of columns. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. 
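# Sketch of the shape-query helpers defined above, which read hdf5 metadata
# without loading the feature matrices; import path, file name and keys are
# assumptions for illustration.
from hyperion.io.h5_data_reader import RandomAccessH5FileDataReader

r = RandomAccessH5FileDataReader("feats.h5")
num_rows = r.read_num_rows(["utt1", "utt2"])  # frames per matrix
dims = r.read_dims(["utt1", "utt2"])          # feature dimension per matrix
r.close()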
@@ -486,55 +447,47 @@ def read_dims(self, keys, assert_same_dim=True): shapes = self.read_shapes(keys, False) dims = np.array([s[-1] for s in shapes], dtype=np.int32) if assert_same_dim: - assert np.all(dims==dims[0]) + assert np.all(dims == dims[0]) return dims - - class RandomAccessH5FileDataReader(RandomAccessH5DataReader): """Class to read from a single hdf5 file in random order - - Attributes: - file_path: scp file to read. - transform: TransformList object, applies a transformation to the - features after reading them from disk. - permissive: If True, if the data that we want to read is not in the file - it returns an empty matrix, if False it raises an exception. + + Attributes: + file_path: scp file to read. + transform: TransformList object, applies a transformation to the + features after reading them from disk. + permissive: If True, if the data that we want to read is not in the file + it returns an empty matrix, if False it raises an exception. """ - + def __init__(self, file_path, **kwargs): super().__init__(file_path, **kwargs) self.lock = multiprocessing.Lock() self._open_archive(file_path) - - def close(self): """Closes the hdf5 files.""" if self.f is not None: self.f.close() self.f = None - - def _open_archive(self, file_path): """Open the hdf5 file it it is not open.""" if self.f is None: self.close() - self.f = h5py.File(file_path, 'r') - + self.f = h5py.File(file_path, "r") @property def keys(self): return list(self.f.keys()) - def read_shapes(self, keys, assert_same_dim=True): """Reads the shapes in the feature matrices of the dataset. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the shapes. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -547,13 +500,13 @@ def read_shapes(self, keys, assert_same_dim=True): shapes = [] for key in keys: - + if not (key in self.f): if self.permissive: shapes.append((0,)) continue else: - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) shape_i = self.f[key].shape shapes.append(shape_i) @@ -561,23 +514,21 @@ def read_shapes(self, keys, assert_same_dim=True): if assert_same_dim: dims = np.array([s[-1] for s in shapes], dtype=np.int32) assert np.all(dims == dims[0]) - - return shapes - + return shapes def read(self, keys, squeeze=False, row_offset=0, num_rows=0): """Reads the feature matrices/vectors for the recordings in keys. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the feature matrices/vectors. - squeeze: If True, it converts the list of + squeeze: If True, it converts the list of matrices/vectors to 3D/2D numpy array. All matrices need to have same number of rows. - offset: List of integers or numpy array of with the first row to + offset: List of integers or numpy array of with the first row to read from each feature matrix. - num_rows: List of integers or numpy array of with the + num_rows: List of integers or numpy array of with the number of rows to read from each feature matrix. If 0 it reads all the rows. 
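# Sketch of random-access reading through an scp file, with permissive=True so
# that missing keys come back as empty arrays instead of raising; the module
# path and file names are assumptions.
from hyperion.io.h5_data_reader import RandomAccessH5ScriptDataReader

r = RandomAccessH5ScriptDataReader("feats.scp", permissive=True)
feats = r.read(["utt1", "utt3"], row_offset=0, num_rows=0)  # list of matrices
r.close()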
@@ -587,10 +538,12 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = (isinstance(row_offset, list) or - isinstance(row_offset, np.ndarray)) - num_rows_is_list = (isinstance(num_rows, list) or - isinstance(num_rows, np.ndarray)) + row_offset_is_list = isinstance(row_offset, list) or isinstance( + row_offset, np.ndarray + ) + num_rows_is_list = isinstance(num_rows, list) or isinstance( + num_rows, np.ndarray + ) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: @@ -598,13 +551,13 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data = [] for i, key in enumerate(keys): - + if not (key in self.f): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows @@ -616,47 +569,41 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if squeeze: data = self._squeeze(data, self.permissive) - - return data - + return data class RandomAccessH5ScriptDataReader(RandomAccessH5DataReader): """Class to read multiple hdf5 files in random order, where a scp file - indicates which hdf5 file contains each feature matrix. - - Attributes: - file_path: scp file to read. - path_prefix: If input_spec is a scp file, it pre-appends - path_prefix string to the second column of - the scp file. This is useful when data - is read from a different directory of that - it was created. - transform: TransformList object, applies a transformation to the - features after reading them from disk. - permissive: If True, if the data that we want to read is not in the file - it returns an empty matrix, if False it raises an exception. - scp_sep: Separator for scp files (default ' '). + indicates which hdf5 file contains each feature matrix. + + Attributes: + file_path: scp file to read. + path_prefix: If input_spec is a scp file, it pre-appends + path_prefix string to the second column of + the scp file. This is useful when data + is read from a different directory of that + it was created. + transform: TransformList object, applies a transformation to the + features after reading them from disk. + permissive: If True, if the data that we want to read is not in the file + it returns an empty matrix, if False it raises an exception. + scp_sep: Separator for scp files (default ' '). 
""" - - def __init__(self, file_path, path_prefix=None, scp_sep=' ', **kwargs): - super().__init__( - file_path, **kwargs) - + + def __init__(self, file_path, path_prefix=None, scp_sep=" ", **kwargs): + super().__init__(file_path, **kwargs) + self.scp = SCPList.load(self.file_path, sep=scp_sep) if path_prefix is not None: self.scp.add_prefix_to_filepath(path_prefix) - archives, archive_idx = np.unique( - self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) - self.locks = [ multiprocessing.Lock() for i in range(len(self.archives)) ] - + self.locks = [multiprocessing.Lock() for i in range(len(self.archives))] - def close(self): """Closes all the open hdf5 files.""" for f in self.f: @@ -664,16 +611,14 @@ def close(self): f.close() self.f = [None] * len(self.f) - @property def keys(self): return self.scp.key - def _open_archive(self, key_idx): - """Opens the hdf5 file correspoding to a given feature/matrix + """Opens the hdf5 file correspoding to a given feature/matrix if it is not already open. - + Args: key_idx: Integer position of the feature matrix in the scp file. @@ -683,17 +628,15 @@ def _open_archive(self, key_idx): archive_idx = self.archive_idx[key_idx] with self.locks[archive_idx]: if self.f[archive_idx] is None: - self.f[archive_idx] = h5py.File(self.archives[archive_idx], 'r') + self.f[archive_idx] = h5py.File(self.archives[archive_idx], "r") return self.f[archive_idx], self.locks[archive_idx] - - def read_shapes(self, keys, assert_same_dim=True): """Reads the shapes in the feature matrices of the dataset. - + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the shapes. assert_same_dim: If True, it raise exception in not all the matrices have the same number of columns. @@ -703,22 +646,21 @@ def read_shapes(self, keys, assert_same_dim=True): """ if isinstance(keys, str): keys = [keys] - #t1 = time.time() + # t1 = time.time() shapes = [] for key in keys: - + if not (key in self.scp): if self.permissive: shapes.append((0,)) continue else: - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) index = self.scp.get_index(key) _, file_path, offset, range_spec = self.scp[index] - row_offset_i, num_rows_i = self._combine_ranges( - range_spec, 0, 0) + row_offset_i, num_rows_i = self._combine_ranges(range_spec, 0, 0) f, lock = self._open_archive(index) if not (key in f): @@ -726,35 +668,32 @@ def read_shapes(self, keys, assert_same_dim=True): shapes.append((0,)) continue else: - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) with lock: shape_i = f[key].shape - shape_i = self._apply_range_to_shape( - shape_i, row_offset_i, num_rows_i) - #print('%s %d %.2f' % (key,time.time()-t1, len(shapes)/len(keys)*100.)) + shape_i = self._apply_range_to_shape(shape_i, row_offset_i, num_rows_i) + # print('%s %d %.2f' % (key,time.time()-t1, len(shapes)/len(keys)*100.)) shapes.append(shape_i) if assert_same_dim: dims = np.array([s[-1] for s in shapes], dtype=np.int32) assert np.all(dims == dims[0]) - - return shapes - + return shapes def read(self, keys, squeeze=False, row_offset=0, num_rows=0): """Reads the feature matrices/vectors for the recordings in keys. 
- + Args: - keys: List of recording names from which we want to retrieve the + keys: List of recording names from which we want to retrieve the feature matrices/vectors. - squeeze: If True, it converts the list of + squeeze: If True, it converts the list of matrices/vectors to 3D/2D numpy array. All matrices need to have same number of rows. - offset: List of integers or numpy array of with the first row to + offset: List of integers or numpy array of with the first row to read from each feature matrix. - num_rows: List of integers or numpy array of with the + num_rows: List of integers or numpy array of with the number of rows to read from each feature matrix. If 0 it reads all the rows. @@ -764,24 +703,26 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if isinstance(keys, str): keys = [keys] - row_offset_is_list = (isinstance(row_offset, list) or - isinstance(row_offset, np.ndarray)) - num_rows_is_list = (isinstance(num_rows, list) or - isinstance(num_rows, np.ndarray)) + row_offset_is_list = isinstance(row_offset, list) or isinstance( + row_offset, np.ndarray + ) + num_rows_is_list = isinstance(num_rows, list) or isinstance( + num_rows, np.ndarray + ) if row_offset_is_list: assert len(row_offset) == len(keys) if num_rows_is_list: assert len(num_rows) == len(keys) data = [] - for i,key in enumerate(keys): - + for i, key in enumerate(keys): + if not (key in self.scp): if self.permissive: data.append(np.array([], dtype=float_cpu())) continue else: - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) index = self.scp.get_index(key) _, file_path, offset, range_spec = self.scp[index] @@ -789,7 +730,8 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): row_offset_i = row_offset[i] if row_offset_is_list else row_offset num_rows_i = num_rows[i] if num_rows_is_list else num_rows row_offset_i, num_rows_i = self._combine_ranges( - range_spec, row_offset_i, num_rows_i) + range_spec, row_offset_i, num_rows_i + ) f, lock = self._open_archive(index) with lock: @@ -798,7 +740,7 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): data.append(np.array([], dtype=float_cpu())) continue else: - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) dset_i = f[key] data_i = _read_h5_data(dset_i, row_offset_i, num_rows_i, self.transform) @@ -807,6 +749,5 @@ def read(self, keys, squeeze=False, row_offset=0, num_rows=0): if squeeze: data = self._squeeze(data, self.permissive) - + return data - diff --git a/hyperion/io/h5_data_writer.py b/hyperion/io/h5_data_writer.py index 679d227d..0685d9b8 100644 --- a/hyperion/io/h5_data_writer.py +++ b/hyperion/io/h5_data_writer.py @@ -16,35 +16,32 @@ class H5DataWriter(DataWriter): """Class to write hdf5 feature files. - + Attributes: archive_path: output data file path. script_path: optional output scp file. flush: If True, it flushes the output after writing each feature file. compress: It True, it uses Kaldi compression. compression_method: Kaldi compression method: - {auto (default), speech_feat, + {auto (default), speech_feat, 2byte-auto, 2byte-signed-integer, 1byte-auto, 1byte-unsigned-integer, 1byte-0-1}. scp_sep: Separator for scp files (default ' '). 
""" def __init__(self, archive_path, script_path=None, **kwargs): - - super().__init__( - archive_path, script_path, **kwargs) - self.f = h5py.File(archive_path, 'w') + super().__init__(archive_path, script_path, **kwargs) + + self.f = h5py.File(archive_path, "w") if script_path is None: self.f_script = None else: - self.f_script = open(script_path, 'w') - + self.f_script = open(script_path, "w") - def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type - + with H5DataWriter('file.h5') as f: f.write(key, data) @@ -52,8 +49,6 @@ def __exit__(self, exc_type, exc_value, traceback): """ self.close() - - def close(self): """Closes the output file""" if self.f is not None: @@ -62,15 +57,11 @@ def close(self): if self.f_script is not None: self.f_script.close() - - def flush(self): """Flushes the file""" self.f.flush() if self.f_script is not None: self.f_script.flush() - - def _convert_data(self, data): """Converts data to the format for saving. @@ -85,42 +76,40 @@ def _convert_data(self, data): """ if isinstance(data, np.ndarray): if self.compress: - mat = KaldiCompressedMatrix.compress( - data, self.compression_method) + mat = KaldiCompressedMatrix.compress(data, self.compression_method) return mat.get_data_attrs() else: data = data.astype(float_save(), copy=False) return data, None else: - raise ValueError('Data is not ndarray') - + raise ValueError("Data is not ndarray") - def write(self, keys, data): """Writes data to file. - + Args: key: List of recodings names. - data: List of Feature matrices or vectors. - If all the matrices have the same dimension + data: List of Feature matrices or vectors. + If all the matrices have the same dimension it can be a 3D numpy array. If they are vectors, it can be a 2D numpy array. """ if isinstance(keys, str): keys = [keys] data = [data] - + for i, key_i in enumerate(keys): - assert is_token(key_i), 'Token %s not valid' % key_i + assert is_token(key_i), "Token %s not valid" % key_i data_i, attrs = self._convert_data(data[i]) dset = self.f.create_dataset(key_i, data=data_i) if attrs is not None: for k, v in attrs.items(): dset.attrs[k] = v - + if self.f_script is not None: - self.f_script.write('%s%s%s\n' % ( - key_i, self.scp_sep, self.archive_path)) + self.f_script.write( + "%s%s%s\n" % (key_i, self.scp_sep, self.archive_path) + ) if self._flush: self.flush() diff --git a/hyperion/io/h5_merger.py b/hyperion/io/h5_merger.py index 7dd553ff..f1b408e7 100644 --- a/hyperion/io/h5_merger.py +++ b/hyperion/io/h5_merger.py @@ -11,8 +11,7 @@ class H5Merger(object): - """Merges several hdf5 files into one. 
- """ + """Merges several hdf5 files into one.""" def __init__(self, input_files, output_file, chunk_size=None): self.input_files = input_files @@ -24,7 +23,6 @@ def merge(self): for h5_file in self.input_files: self._merge_file(hw, h5_file) - def _merge_file(self, hw, input_file): hr = HR(input_file) datasets = hr.get_datasets() @@ -32,12 +30,9 @@ def _merge_file(self, hw, input_file): chunk = len(datasets) else: chunk = self.chunk_size - + for first in range(0, len(datasets), chunk): - last = min(first+chunk, len(datasets)) + last = min(first + chunk, len(datasets)) keys = datasets[first:last] x = hr.read(keys) - hw.write(keys, '', x) - - - + hw.write(keys, "", x) diff --git a/hyperion/io/hyp_data_reader.py b/hyperion/io/hyp_data_reader.py index d3ed0c9f..9219187a 100644 --- a/hyperion/io/hyp_data_reader.py +++ b/hyperion/io/hyp_data_reader.py @@ -15,24 +15,23 @@ class HypDataReader(object): """ Class to read data from hdf5 files (deprecated). """ + def __init__(self, file_path): self.file_path = file_path - self.f = h5py.File(file_path, 'r') - + self.f = h5py.File(file_path, "r") def get_datasets(self): keys = [] for ds in self.f: keys.append(ds) return keys - - def read(self, keys, field='', return_tensor=False): + def read(self, keys, field="", return_tensor=False): if isinstance(keys, list): - datasets = [key+field for key in keys] + datasets = [key + field for key in keys] else: - datasets = keys.astype(np.object)+field - + datasets = keys.astype(np.object) + field + if return_tensor: # we assume that all datasets have a common shape shape_0 = self.f[datasets[0]].shape @@ -40,9 +39,9 @@ def read(self, keys, field='', return_tensor=False): X = np.zeros(shape, dtype=float_cpu()) else: X = [] - + for i in range(len(keys)): - assert datasets[i] in self.f, 'Dataset %s not found' % datasets[i] + assert datasets[i] in self.f, "Dataset %s not found" % datasets[i] X_i = self.f[datasets[i]] if return_tensor: X[i] = X_i @@ -51,52 +50,47 @@ def read(self, keys, field='', return_tensor=False): return X - - def get_num_rows(self, keys, field=''): + def get_num_rows(self, keys, field=""): if isinstance(keys, list): - datasets = [key+field for key in keys] + datasets = [key + field for key in keys] else: - datasets = keys.astype(np.object)+field + datasets = keys.astype(np.object) + field num_ds = len(datasets) num_rows = np.zeros((num_ds,), dtype=int) - - for i,ds in enumerate(datasets): - assert ds in self.f, 'Dataset %s not found' % ds + + for i, ds in enumerate(datasets): + assert ds in self.f, "Dataset %s not found" % ds num_rows[i] = self.f[ds].shape[0] return num_rows - - def read_slice(self, key, index, num_samples, field=''): + def read_slice(self, key, index, num_samples, field=""): dataset = key + field - assert dataset in self.f, 'Dataset %s not found' % dataset - X = self.f[dataset][index:index+num_samples] + assert dataset in self.f, "Dataset %s not found" % dataset + X = self.f[dataset][index : index + num_samples] return X - - def read_random_slice(self, key, num_samples, rng, field=''): + def read_random_slice(self, key, num_samples, rng, field=""): dataset = key + field - assert dataset in self.f, 'Dataset %s not found' % dataset + assert dataset in self.f, "Dataset %s not found" % dataset num_rows = self.f[dataset].shape[0] # print('hola',num_rows,num_samples,num_rows-num_samples) # index = rng.random_integers(low=0, high=num_rows-num_samples, size=1)[0] - index = rng.randint(low=0, high=num_rows-num_samples+1) - X = self.f[dataset][index:index+num_samples] + index = 
rng.randint(low=0, high=num_rows - num_samples + 1) + X = self.f[dataset][index : index + num_samples] return X, index - - def read_random_samples(self, key, num_samples, rng, field='', replace=True): + def read_random_samples(self, key, num_samples, rng, field="", replace=True): dataset = key + field - assert dataset in self.f, 'Dataset %s not found' % dataset + assert dataset in self.f, "Dataset %s not found" % dataset num_rows = self.f[dataset].shape[0] - index = np.sort(rng.choice(np.arange(num_rows), size=num_samples, replace=replace)) + index = np.sort( + rng.choice(np.arange(num_rows), size=num_samples, replace=replace) + ) min_index = index[0] - max_index = index[-1]+1 + max_index = index[-1] + 1 index -= min_index X = self.f[dataset][min_index:max_index] X = X[index] return X, index - - - diff --git a/hyperion/io/hyp_data_writer.py b/hyperion/io/hyp_data_writer.py index 2ebf070d..9a5b5906 100644 --- a/hyperion/io/hyp_data_writer.py +++ b/hyperion/io/hyp_data_writer.py @@ -10,6 +10,7 @@ from ..hyp_defs import float_save from ..utils.list_utils import list2ndarray, ismember + class HypDataWriter(object): """ Class to write data to hdf5 files (deprecated). @@ -17,16 +18,14 @@ class HypDataWriter(object): def __init__(self, file_path): self.file_path = file_path - self.f = h5py.File(file_path, 'w') - + self.f = h5py.File(file_path, "w") def write(self, keys, field, x): - #datasets = keys.astype(np.object)+field + # datasets = keys.astype(np.object)+field if isinstance(keys, str): keys = [keys] x = [x] - - datasets = [ key+field for key in keys] + + datasets = [key + field for key in keys] for i, ds in enumerate(datasets): self.f.create_dataset(ds, data=x[i].astype(float_save())) - diff --git a/hyperion/io/int32_writer.py b/hyperion/io/int32_writer.py index 575d7a93..c823dc0e 100644 --- a/hyperion/io/int32_writer.py +++ b/hyperion/io/int32_writer.py @@ -7,8 +7,9 @@ from .data_writer import DataWriter + class Int32Writer(DataWriter): - """Class to write data to int32 files. - """ + """Class to write data to int32 files.""" + def __init__(self, wspecifier): super(Int32Writer, self).__init__(wspecifier) diff --git a/hyperion/io/kaldi_data_reader.py b/hyperion/io/kaldi_data_reader.py index a175c976..712941ec 100644 --- a/hyperion/io/kaldi_data_reader.py +++ b/hyperion/io/kaldi_data_reader.py @@ -11,137 +11,135 @@ from ..hyp_defs import float_cpu from ..utils.scp_list import SCPList + class KaldiDataReader(object): - """Class to read features from .ark files. 
- """ - - def __init__(self, file_path, input_dir=None, sep=' '): - self.file_path = file_path - self.cur_record=0 - - scp = SCPList.load(file_path, sep=sep) - if input_dir is None: - self.scp = OrderedDict((k, v) for (k, v) in - zip(scp.key, scp.file_path)) - else: - input_dir+='/' - self.scp = OrderedDict((k, input_dir+v) for (k, v) in - zip(scp.key, scp.file_path)) - - - def read(self, keys=None, num_records=None, first_record=None, squeeze=False): - if keys is None: - keys=list(self.scp.keys()) - if first_record is not None: - self.cur_record = first_record - - if num_records is None: - keys = keys[self.cur_record:] - self.cur_record = len(keys) - else: - final_record = min(self.cur_record+num_records, len(keys)) - keys = keys[self.cur_record:final_record] - self.cur_record = final_record - - X = [] - for i, key in enumerate(keys): - file_path = self.scp[key] - m = self._read_matrix(file_path) - if squeeze: - m = np.squeeze(m) - X.append(m) - - return X, keys - - - def reset(self): - self.cur_record=0 - - def eof(self): - return self.cur_record == len(self.scp.keys()) - - @staticmethod - def _open(file_path, mode='rb'): - try: - # separate offset from filename (optional), - offset = None - if re.search(':[0-9]+$', file_path): - (file_path, offset) = file_path.rsplit(':',1) - - if file_path.split('.')[-1] == 'gz': - f = gzip.open(file_path, mode) - else: - f = open(file_path, mode) - if offset is not None: - f.seek(int(offset)) - return f - except TypeError: - return file_path - - - @staticmethod - def _read_matrix(f): - # try: - # f = KaldiDataReader._open(f, 'r') - # binary = f.read(2) - # except: - f = KaldiDataReader._open(f, 'rb') - binary = f.read(2) - print('except') - print(binary) - - if binary == b'\0B' : - mat = KaldiDataReader._read_bin_matrix(f) + """Class to read features from .ark files.""" + + def __init__(self, file_path, input_dir=None, sep=" "): + self.file_path = file_path + self.cur_record = 0 + + scp = SCPList.load(file_path, sep=sep) + if input_dir is None: + self.scp = OrderedDict((k, v) for (k, v) in zip(scp.key, scp.file_path)) + else: + input_dir += "/" + self.scp = OrderedDict( + (k, input_dir + v) for (k, v) in zip(scp.key, scp.file_path) + ) + + def read(self, keys=None, num_records=None, first_record=None, squeeze=False): + if keys is None: + keys = list(self.scp.keys()) + if first_record is not None: + self.cur_record = first_record + + if num_records is None: + keys = keys[self.cur_record :] + self.cur_record = len(keys) else: - assert(binary == b' [') - mat = KaldiDataReader._read_ascii_matrix(f) - return mat - - - @staticmethod - def _read_bin_matrix(f): - stype = f.read(3).decode('ascii') - dtype = None - ndim = 0 - if stype[0] == 'F': dtype = 'float32' - if stype[0] == 'D': dtype = 'float64' - if stype[1] == 'V': ndim = 1 - if stype[1] == 'M': ndim = 2 - assert(dtype is not None) - assert(ndim == 1 or ndim==2) - - # Dimensions - if ndim == 2: - f.read(1) - rows = struct.unpack(' max_samples: - logging.warning('Duration of segment %s in segments-file (%d samples) > ' - 'full utterance %s duration (%d)' % ( - segment['segment_id'], num_samples_i, - file_id, max_samples)) + logging.warning( + "Duration of segment %s in segments-file (%d samples) > " + "full utterance %s duration (%d)" + % (segment["segment_id"], num_samples_i, file_id, max_samples) + ) num_samples_i = max_samples else: key_i, file_path, offset, range_spec = self.scp[self.cur_item] num_samples_i = range_spec[1] - + keys.append(key_i) num_samples[i] = num_samples_i self.cur_item += 1 - - 
return keys, num_samples - + return keys, num_samples def read_time_duration(self, num_records=0): """Reads the duration in secs. of the utterances of the packed audio file - + Args: - num_records: How many utterances to read, if num_records=0 it + num_records: How many utterances to read, if num_records=0 it reads all utterances Returns: @@ -282,7 +276,7 @@ def read_time_duration(self, num_records=0): num_records = len(self.segments) - self.cur_item else: num_records = len(self.scp) - self.cur_item - + keys = [] time_dur = np.zeros((num_records,), dtype=np.float) for i in range(num_records): @@ -292,27 +286,25 @@ def read_time_duration(self, num_records=0): if self.with_segments: segment = self.segments[self.cur_item] - key_i = segment['segment_id'] - t_beg = segment['tbeg'] - t_end = segment['tend'] + key_i = segment["segment_id"] + t_beg = segment["tbeg"] + t_end = segment["tend"] time_dur_i = t_end - t_beg else: key_i, file_path, _, range_spec = self.scp[self.cur_item] self._open_archive(file_path) fs = self.f.samplerate - time_dur_i = range_spec[1]/fs + time_dur_i = range_spec[1] / fs keys.append(key_i) time_dur[i] = time_dur_i self.cur_item += 1 - + return keys, time_dur - - def read(self, num_records=0, time_offset=0, time_durs=0): """Reads next num_records audio files - + Args: num_records: Number of audio files to read. time_offset: List of floats indicating the start time to read in the utterance. @@ -343,23 +335,26 @@ def read(self, num_records=0, time_offset=0, time_durs=0): if self.with_segments: segment = self.segments[self.cur_item] - key = segment['segment_id'] + key = segment["segment_id"] - segment_range_spec = (segment['tbeg'], segment['tend']-segment['tbeg']) + segment_range_spec = ( + segment["tbeg"], + segment["tend"] - segment["tbeg"], + ) offset_i, dur_i = self._combine_ranges( - segment_range_spec, offset_i, dur_i) - file_path, offset, range_spec = self.scp[segment['file_id']] + segment_range_spec, offset_i, dur_i + ) + file_path, offset, range_spec = self.scp[segment["file_id"]] else: key, file_path, offset, range_spec = self.scp[self.cur_item] self._open_archive(file_path) fs_i = self.f.samplerate - offset_i = int(math.floor(offset_i*fs_i)) - dur_i = int(math.floor(dur_i*fs_i)) - offset_i, dur_i = self._combine_ranges( - range_spec, offset_i, dur_i) - - self.f.seek(offset+offset_i) + offset_i = int(math.floor(offset_i * fs_i)) + dur_i = int(math.floor(dur_i * fs_i)) + offset_i, dur_i = self._combine_ranges(range_spec, offset_i, dur_i) + + self.f.seek(offset + offset_i) x_i = self.scale * self.f.read(dur_i, dtype=float_cpu()) keys.append(key) @@ -369,50 +364,50 @@ def read(self, num_records=0, time_offset=0, time_durs=0): return keys, data, fs - @staticmethod def filter_args(**kwargs): - valid_args = ('part_idx', 'num_parts','wav_scale') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("part_idx", "num_parts", "wav_scale") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - + p1 = "--" + prefix + "." 
+ parser.add_argument( - p1+'wav-scale', default=2**15-1, type=float, - help=('multiplicative factor for waveform')) + p1 + "wav-scale", + default=2 ** 15 - 1, + type=float, + help=("multiplicative factor for waveform"), + ) try: parser.add_argument( - p1+'part-idx', type=int, default=1, - help=('splits the list of files in num-parts and ' - 'process part_idx')) + p1 + "part-idx", + type=int, + default=1, + help=("splits the list of files in num-parts and " "process part_idx"), + ) parser.add_argument( - p1+'num-parts', type=int, default=1, - help=('splits the list of files in num-parts and ' - 'process part_idx')) + p1 + "num-parts", + type=int, + default=1, + help=("splits the list of files in num-parts and " "process part_idx"), + ) except: pass - class RandomAccessPackedAudioReader(PackedAudioReader): - - def __init__(self, file_path, segments_path=None, wav_scale=2**15-1): + def __init__(self, file_path, segments_path=None, wav_scale=2 ** 15 - 1): super().__init__(file_path, segments_path, wav_scale) - archives, archive_idx = np.unique( - self.scp.file_path, return_inverse=True) + archives, archive_idx = np.unique(self.scp.file_path, return_inverse=True) self.archives = archives self.archive_idx = archive_idx self.f = [None] * len(self.archives) - self.locks = [ multiprocessing.Lock() for i in range(len(self.archives)) ] - + self.locks = [multiprocessing.Lock() for i in range(len(self.archives))] def close(self): """Closes all the open audio files.""" @@ -421,15 +416,13 @@ def close(self): f.close() self.f = [None] * len(self.f) - - def _open_archive(self, key_idx, offset=None): """Opens the packed audio file correspoding to a given utterance - if it is not already open and moves the file pointer to the + if it is not already open and moves the file pointer to the point where we can read the utterance If the file was already open, it only moves the file pointer. - + Args: key_idx: Integer position of the utterance in the scp file. offset: sample where the utterance starts in the packed audio file. @@ -441,7 +434,7 @@ def _open_archive(self, key_idx, offset=None): archive_idx = self.archive_idx[key_idx] with self.locks[archive_idx]: if self.f[archive_idx] is None: - self.f[archive_idx] = sf.SoundFile(self.archives[archive_idx], 'r') + self.f[archive_idx] = sf.SoundFile(self.archives[archive_idx], "r") f = self.f[archive_idx] if offset is not None: @@ -449,12 +442,9 @@ def _open_archive(self, key_idx, offset=None): return f, self.locks[archive_idx] - - - def read_num_samples(self, keys): """Reads the number of samples in the utterances of the packed audio file - + Args: keys: List of recording/segment_ids names. 
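# Sketch of querying segment metadata from a packed audio archive; the import
# path, scp and segments file names are assumptions for illustration.
from hyperion.io.packed_audio_reader import RandomAccessPackedAudioReader

r = RandomAccessPackedAudioReader("packed_audio.scp", segments_path="segments")
num_samples = r.read_num_samples(["seg1"])    # samples per segment
durs = r.read_time_duration(["seg1"])         # duration in seconds
r.close()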
@@ -469,40 +459,39 @@ def read_num_samples(self, keys): if self.with_segments: if not (key in self.segments): - raise Exception('Key %s not found' % key) - + raise Exception("Key %s not found" % key) + segment = self.segments[key] - file_id = segment['file_id'] - t_beg = segment['tbeg'] - t_end = segment['tend'] - index = self.scp.get_index(segment['file_id']) + file_id = segment["file_id"] + t_beg = segment["tbeg"] + t_end = segment["tend"] + index = self.scp.get_index(segment["file_id"]) _, file_path, offset, range_spec = self.scp[index] f, lock = self._open_archive(index) fs = f.samplerate - num_samples_i = int(math.floor((t_end-t_beg)*fs)) + num_samples_i = int(math.floor((t_end - t_beg) * fs)) max_samples = range_spec[1] if num_samples_i > max_samples: - logging.warning('Duration of segment %s in segments-file (%d samples) > ' - 'full utterance %s duration (%d)' % ( - segment['segment_id'], num_samples_i, - file_id, max_samples)) + logging.warning( + "Duration of segment %s in segments-file (%d samples) > " + "full utterance %s duration (%d)" + % (segment["segment_id"], num_samples_i, file_id, max_samples) + ) num_samples_i = max_samples else: if not (key in self.scp): - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) file_path, offset, range_spec = self.scp[key] num_samples_i = range_spec[1] - + num_samples[i] = num_samples_i return num_samples - - def read_time_duration(self, keys): """Reads the duration in secs. of the utterances of the packed audio file - + Args: keys: List of recording/segment_ids names. @@ -517,30 +506,28 @@ def read_time_duration(self, keys): if self.with_segments: if not (key in self.segments): - raise Exception('Key %s not found' % key) - + raise Exception("Key %s not found" % key) + segment = self.segments[key] - t_beg = segment['tbeg'] - t_end = segment['tend'] + t_beg = segment["tbeg"] + t_end = segment["tend"] time_dur_i = t_end - t_beg else: if not (key in self.scp): - raise Exception('Key %s not found' % key) + raise Exception("Key %s not found" % key) index = self.scp.get_index(key) _, file_path, offset, range_spec = self.scp[index] f, lock = self._open_archive(index) fs = f.samplerate - time_dur_i = range_spec[1]/fs - + time_dur_i = range_spec[1] / fs + time_dur[i] = time_dur_i return time_dur - - def read(self, keys, time_offset=0, time_durs=0): """Reads the waveforms for the recordings in keys. - + Args: keys: List of recording/segment_ids names. time_offset: List of floats indicating the start time to read in the utterance. 
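# Sketch of reading a 2 second excerpt starting 0.5 s into a segment, using the
# read() signature documented above; file and segment names are assumptions.
from hyperion.io.packed_audio_reader import RandomAccessPackedAudioReader

r = RandomAccessPackedAudioReader("packed_audio.scp", segments_path="segments")
data, fs = r.read(["seg1"], time_offset=0.5, time_durs=2.0)
r.close()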
@@ -561,24 +548,28 @@ def read(self, keys, time_offset=0, time_durs=0): offset_i = time_offset[i] if offset_is_list else time_offset dur_i = time_durs[i] if dur_is_list else time_durs - #t1= time.time() + # t1= time.time() if self.with_segments: if not (key in self.segments): - raise Exception('Key %s not found' % key) - + raise Exception("Key %s not found" % key) + segment = self.segments[key] - segment_range_spec = (segment['tbeg'], segment['tend']-segment['tbeg']) + segment_range_spec = ( + segment["tbeg"], + segment["tend"] - segment["tbeg"], + ) offset_i, dur_i = self._combine_ranges( - segment_range_spec, offset_i, dur_i) - index = self.scp.get_index(segment['file_id']) + segment_range_spec, offset_i, dur_i + ) + index = self.scp.get_index(segment["file_id"]) _, file_path, offset, range_spec = self.scp[index] else: if not (key in self.scp): - raise Exception('Key %s not found' % key) - + raise Exception("Key %s not found" % key) + index = self.scp.get_index(key) _, file_path, offset, range_spec = self.scp[index] - #t2=time.time() + # t2=time.time() # aid = self.archive_idx[index] f, lock = self._open_archive(index) # while lock.locked(): @@ -589,20 +580,19 @@ def read(self, keys, time_offset=0, time_durs=0): # logging.info('checking unlocked {} {} {} {} {} {} {} {}'.format( # index, aid, lock, l, key, offset, offset_i, dur_i)) with lock: - #t3 = time.time() + # t3 = time.time() # logging.info('lock {}'.format(aid)) fs_i = f.samplerate - offset_i = int(math.floor(offset_i*fs_i)) - dur_i = int(math.floor(dur_i*fs_i)) - offset_i, dur_i = self._combine_ranges( - range_spec, offset_i, dur_i) - #t4=time.time() + offset_i = int(math.floor(offset_i * fs_i)) + dur_i = int(math.floor(dur_i * fs_i)) + offset_i, dur_i = self._combine_ranges(range_spec, offset_i, dur_i) + # t4=time.time() cur_pos = f.tell() - f.seek((offset+offset_i-cur_pos), sf.SEEK_CUR) - #t5=time.time() + f.seek((offset + offset_i - cur_pos), sf.SEEK_CUR) + # t5=time.time() x_i = self.scale * f.read(dur_i, dtype=float_cpu()) - #t6=time.time() - #logging.info('time={} {} {} {} {} {}'.format(t6-t1,t2-t1,t3-t2,t4-t3,t5-t4,t6-t5)) + # t6=time.time() + # logging.info('time={} {} {} {} {} {}'.format(t6-t1,t2-t1,t3-t2,t4-t3,t5-t4,t6-t5)) # try: # logging.info('par {} {} {} {} {} {} {} {}'.format( # index, aid, lock, l, key, offset, offset_i, dur_i)) @@ -620,7 +610,7 @@ def read(self, keys, time_offset=0, time_durs=0): # index, aid, lock, l, key, offset, offset_i, dur_i)) # #time.sleep(10) # raise e - + # logging.info('unlock {}'.format(aid)) data.append(x_i) @@ -628,24 +618,24 @@ def read(self, keys, time_offset=0, time_durs=0): return data, fs - @staticmethod def filter_args(**kwargs): - valid_args = ('wav_scale',) + valid_args = ("wav_scale",) - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument( - p1+'wav-scale', default=2**15, type=float, - help=('multiplicative factor for waveform')) + p1 = "--" + prefix + "." 
+ parser.add_argument( + p1 + "wav-scale", + default=2 ** 15, + type=float, + help=("multiplicative factor for waveform"), + ) add_argparse_args = add_class_args diff --git a/hyperion/io/packed_audio_writer.py b/hyperion/io/packed_audio_writer.py index 29e95015..3a15227a 100644 --- a/hyperion/io/packed_audio_writer.py +++ b/hyperion/io/packed_audio_writer.py @@ -15,23 +15,31 @@ class PackedAudioWriter(object): """Class to pack multiple audio files into a single audio file. - It will produce a single audio file (packed oudio file) - plus an scp file with the + It will produce a single audio file (packed oudio file) + plus an scp file with the time-stamps indicating the location of the original files in packed audio file Attributes: audio_path: output data file path. script_path: optional output scp file. - audio_format: audio file format - subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], + audio_format: audio file format + subtype: subtype of audio in [PCM_16, PCM_32, FLOAT, DOUBLE, ...], if None, it uses soundfile defaults (recommended) fs: sampling freq. scp_sep: Separator for scp files (default ' '). """ - def __init__(self, audio_path, script_path=None, - audio_format='wav', audio_subtype=None, - fs=16000, wav_scale=1, scp_sep=' '): + + def __init__( + self, + audio_path, + script_path=None, + audio_format="wav", + audio_subtype=None, + fs=16000, + wav_scale=1, + scp_sep=" ", + ): self.audio_path = audio_path self.script_path = script_path self.audio_format = audio_format @@ -40,7 +48,7 @@ def __init__(self, audio_path, script_path=None, self.wav_scale = wav_scale self.cur_pos = 0 - assert '.' + self.audio_format in valid_ext + assert "." + self.audio_format in valid_ext if audio_subtype is None: self.subtype = sf.default_subtype(self.audio_format) else: @@ -51,49 +59,47 @@ def __init__(self, audio_path, script_path=None, self.audio_dtype = subtype_to_npdtype[self.subtype] if script_path is not None: - self.f_script = open(script_path, 'w') + self.f_script = open(script_path, "w") else: self.f_script = None - - self.f_audio = sf.SoundFile(audio_path, mode='w', samplerate=self.fs, - subtype=self.subtype, format=audio_format, - channels=1) - + self.f_audio = sf.SoundFile( + audio_path, + mode="w", + samplerate=self.fs, + subtype=self.subtype, + format=audio_format, + channels=1, + ) def __enter__(self): """Function required when entering contructions of type - - with PackedAudioWriter('./output_file.flac', './audio_file.scp', - audio_format='flac') as f: - f.write(key, data) + + with PackedAudioWriter('./output_file.flac', './audio_file.scp', + audio_format='flac') as f: + f.write(key, data) """ return self - - def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type - with PackedAudioWriter('./output_file.flac', './audio_file.scp', - audio_format='flac') as f: - f.write(key, data) - + with PackedAudioWriter('./output_file.flac', './audio_file.scp', + audio_format='flac') as f: + f.write(key, data) + """ self.close() - - def close(self): """Closes the script file if open""" self.f_audio.close() if self.f_script is not None: self.f_script.close() - def write(self, keys, data): """Writes waveform to packed audio file. - + Args: key: List of recodings names. 
data: List of waveforms @@ -103,46 +109,69 @@ def write(self, keys, data): data = [data] for i, key_i in enumerate(keys): - assert is_token(key_i), 'Token %s not valid' % key_i + assert is_token(key_i), "Token %s not valid" % key_i data_i = data[i] / self.wav_scale data_i = data_i.astype(self.audio_dtype, copy=False) num_samples = len(data_i) self.f_audio.write(data_i) self.f_audio.flush() - + if self.f_script is not None: - self.f_script.write('%s%s%s:%d[0:%d]\n' % ( - key_i, self.scp_sep, self.audio_path, self.cur_pos, num_samples-1)) + self.f_script.write( + "%s%s%s:%d[0:%d]\n" + % ( + key_i, + self.scp_sep, + self.audio_path, + self.cur_pos, + num_samples - 1, + ) + ) self.f_script.flush() self.cur_pos += num_samples - @staticmethod def filter_args(**kwargs): - valid_args = ('output_fs','output_wav_scale', 'output_audio_format', 'output_audio_subtype') - return dict((re.sub('output_','', k), kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ( + "output_fs", + "output_wav_scale", + "output_audio_format", + "output_audio_subtype", + ) + return dict( + (re.sub("output_", "", k), kwargs[k]) for k in valid_args if k in kwargs + ) - @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'output-wav-scale', default=1, type=float, - help=('scale to divide the waveform before writing')) - - parser.add_argument(p1+'output-audio-format', default='flac', - choices=['flac','ogg', 'wav'], - help=('ouput audio format')) - - parser.add_argument(p1+'output-audio-subtype', default=None, - choices=['pcm_16','pcm_24', 'float', 'double', 'vorbis'], - help=('coding format for audio file')) - - parser.add_argument(p1+'output-fs', default=16000, type=int, - help=('output sample frequency')) + p1 = "--" + prefix + "." + + parser.add_argument( + p1 + "output-wav-scale", + default=1, + type=float, + help=("scale to divide the waveform before writing"), + ) + + parser.add_argument( + p1 + "output-audio-format", + default="flac", + choices=["flac", "ogg", "wav"], + help=("ouput audio format"), + ) + + parser.add_argument( + p1 + "output-audio-subtype", + default=None, + choices=["pcm_16", "pcm_24", "float", "double", "vorbis"], + help=("coding format for audio file"), + ) + + parser.add_argument( + p1 + "output-fs", default=16000, type=int, help=("output sample frequency") + ) add_argparse_args = add_class_args diff --git a/hyperion/io/rw_specifiers.py b/hyperion/io/rw_specifiers.py index 66b9fece..37f579b4 100644 --- a/hyperion/io/rw_specifiers.py +++ b/hyperion/io/rw_specifiers.py @@ -8,14 +8,17 @@ import re from enum import Enum + class ArchiveType(Enum): """Types of archive: hdf5, Kaldi Ark or packed-audio files.""" + H5 = 0 ARK = 1 AUDIO = 2 SEGMENT_LIST = 3 RTTM = 4 + """Documentation for "wspecifier" (taken from Kaldi). "wspecifier" describes how we write a set of objects indexed by keys. The basic, unadorned wspecifiers are as follows: @@ -71,13 +74,13 @@ class ArchiveType(Enum): class WSpecType(Enum): """Type of Kaldi stype write specifiers.""" - NO = 0 # No specifier + + NO = 0 # No specifier ARCHIVE = 1 # Specifier contains Ark, hdf5, segment_list or rttm file. - SCRIPT = 2 # Specifier contains scp file. - BOTH = 3 # Specifier contains Ark/hdf5 file and scp file. + SCRIPT = 2 # Specifier contains scp file. + BOTH = 3 # Specifier contains Ark/hdf5 file and scp file. - class WSpecifier(object): """Class to parse Kaldi style write specifier. 
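# Sketch of parsing a combined archive-plus-script write specifier with the
# class below; the import path is an assumption based on this file's location.
from hyperion.io.rw_specifiers import WSpecifier, WSpecType, ArchiveType

spec = WSpecifier.create("ark,scp:foo.ark,foo.scp")
assert spec.spec_type == WSpecType.BOTH
assert spec.archive_type == ArchiveType.ARK
assert (spec.archive, spec.script) == ("foo.ark", "foo.scp")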
@@ -98,9 +101,16 @@ class WSpecifier(object): missing scp entries """ - def __init__(self, spec_type, archive, script, - archive_type=ArchiveType.H5, - binary=True, flush=False, permissive=False): + def __init__( + self, + spec_type, + archive, + script, + archive_type=ArchiveType.H5, + binary=True, + flush=False, + permissive=False, + ): self.archive = archive self.script = script self.spec_type = spec_type @@ -109,8 +119,6 @@ def __init__(self, spec_type, archive, script, self.flush = flush self.permissive = permissive - - @classmethod def create(cls, wspecifier): """Creates WSpecifier object from string. @@ -126,13 +134,13 @@ def create(cls, wspecifier): Returns: WSpecifier object. """ - fields = wspecifier.strip().split(':') + fields = wspecifier.strip().split(":") if len(fields) == 1: assert len(fields[0]) > 0 return cls(WSpecType.ARCHIVE, fields[0], None) elif len(fields) == 2: - options = fields[0].strip().split(',') - archives = fields[1].strip().split(',') + options = fields[0].strip().split(",") + archives = fields[1].strip().split(",") archive = None script = None @@ -143,86 +151,81 @@ def create(cls, wspecifier): cur_archive = 0 for option in options: - if option == 'h5': + if option == "h5": assert archive_type is None - assert archive is None, ( - 'Repeated h5, ark in wspecifier %s' % script) + assert archive is None, "Repeated h5, ark in wspecifier %s" % script assert len(archives) > cur_archive archive_type = ArchiveType.H5 archive = archives[cur_archive] cur_archive += 1 - elif option == 'ark': + elif option == "ark": assert archive_type is None - assert archive is None, ( - 'Repeated h5, ark in wspecifier %s' % script) + assert archive is None, "Repeated h5, ark in wspecifier %s" % script assert len(archives) > cur_archive archive_type = ArchiveType.ARK archive = archives[cur_archive] cur_archive += 1 - elif option == 'audio': + elif option == "audio": assert archive_type is None assert archive is None, ( - 'Repeated h5, ark, audio in wspecifier %s' % script) + "Repeated h5, ark, audio in wspecifier %s" % script + ) assert len(archives) > cur_archive archive_type = ArchiveType.AUDIO archive = archives[cur_archive] cur_archive += 1 - elif option == 'scp': - assert script is None, ( - 'Repeated scp in wspecifier %s' % script) + elif option == "scp": + assert script is None, "Repeated scp in wspecifier %s" % script assert len(archives) > cur_archive script = archives[cur_archive] cur_archive += 1 - elif option == 'segments': + elif option == "segments": assert archive_type is None - assert archive is None, ( - 'Repeated h5, ark in wspecifier %s' % script) + assert archive is None, "Repeated h5, ark in wspecifier %s" % script assert len(archives) > cur_archive archive_type = ArchiveType.SEGMENT_LIST archive = archives[cur_archive] cur_archive += 1 - elif option == 'rttm': + elif option == "rttm": assert archive_type is None - assert archive is None, ( - 'Repeated h5, ark in wspecifier %s' % script) + assert archive is None, "Repeated h5, ark in wspecifier %s" % script assert len(archives) > cur_archive archive_type = ArchiveType.RTTM archive = archives[cur_archive] cur_archive += 1 - elif option == 'f': - flush = True - elif option in ['b', 't', 'nf', 'p']: + elif option == "f": + flush = True + elif option in ["b", "t", "nf", "p"]: pass else: - raise ValueError('Wrong wspecifier options %s' - % fields[0]) - + raise ValueError("Wrong wspecifier options %s" % fields[0]) + if archive is None: if script is not None: spec_type = WSpecType.SCRIPT else: - raise ValueError('Wrong 
wspecifier %s ' % wspecifier) + raise ValueError("Wrong wspecifier %s " % wspecifier) else: if script is None: spec_type = WSpecType.ARCHIVE else: spec_type = WSpecType.BOTH - + if archive_type == ArchiveType.ARK: for option in options: - if option == 't': + if option == "t": binary = False - elif option == 'p': + elif option == "p": permissive = True - - return cls(spec_type, archive, script, - archive_type, binary, flush, permissive) - else: - raise ValueError('Two many fields (%d>2) in wspecifier %s' - % (len(fields), wspecifier)) + return cls( + spec_type, archive, script, archive_type, binary, flush, permissive + ) + else: + raise ValueError( + "Two many fields (%d>2) in wspecifier %s" % (len(fields), wspecifier) + ) - def __eq__(self, other): """Equal operator.""" eq = self.archive == other.archive @@ -233,14 +236,11 @@ def __eq__(self, other): eq = eq and self.flush == other.flush eq = eq and self.permissive == other.permissive return eq - def __ne__(self, other): """Non-equal operator.""" return not self.__eq__(other) - - def __cmp__(self, other): """Comparison operator.""" if self.__eq__(other): @@ -248,8 +248,6 @@ def __cmp__(self, other): return 1 - - """Documentation for "rspecifier" (Taken from Kaldi) "rspecifier" describes how we read a set of objects indexed by keys. The possibilities are: @@ -288,7 +286,8 @@ def __cmp__(self, other): "o, s, p, ark:gunzip -c foo.gz|" """ - + + class RSpecType(Enum): NO = 0 ARCHIVE = 1 @@ -296,10 +295,17 @@ class RSpecType(Enum): class RSpecifier(object): - def __init__(self, spec_type, archive, - archive_type=ArchiveType.H5, - once = False, is_sorted = False, called_sorted = False, - permissive=False, background = False): + def __init__( + self, + spec_type, + archive, + archive_type=ArchiveType.H5, + once=False, + is_sorted=False, + called_sorted=False, + permissive=False, + background=False, + ): self.spec_type = spec_type self.archive = archive @@ -310,21 +316,19 @@ def __init__(self, spec_type, archive, self.permissive = permissive self.background = background - @property def script(self): return self.archive - @classmethod def create(cls, rspecifier): - fields = rspecifier.strip().split(':') + fields = rspecifier.strip().split(":") if len(fields) == 1: assert len(fields[0]) > 0 return cls(RSpecType.ARCHIVE, fields[0]) elif len(fields) == 2: - options = fields[0].strip().split(',') - archives = fields[1].strip().split(',') + options = fields[0].strip().split(",") + archives = fields[1].strip().split(",") assert len(archives) == 1 spec_type = None @@ -337,77 +341,83 @@ def create(cls, rspecifier): background = False for option in options: - if option == 'h5': + if option == "h5": assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.H5 - elif option == 'ark': + elif option == "ark": assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.ARK - elif option == 'audio': + elif option == "audio": assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.AUDIO - elif option == 'segments': + elif option == "segments": assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.SEGMENT_LIST - elif option == 'rttm': + elif option == "rttm": assert spec_type is None spec_type = RSpecType.ARCHIVE archive_type = ArchiveType.RTTM - elif option == 'scp': + elif option == "scp": assert spec_type is None spec_type = RSpecType.SCRIPT - elif option == 'p': + elif option == "p": permissive = True - elif option in ['o', 's', 'cs', 'bg']: + elif option in 
["o", "s", "cs", "bg"]: pass else: - raise ValueError('Wrong wspecifier options %s' - % fields[0]) + raise ValueError("Wrong wspecifier options %s" % fields[0]) + + assert spec_type is not None, "Wrong wspecifier options %s" % fields[0] - assert spec_type is not None, ('Wrong wspecifier options %s' - % fields[0]) - if spec_type == RSpecType.SCRIPT: - with open(archive, 'r') as f: - scp_f2 = f.readline().strip().split(' ')[1] - if re.match(r'.*\.h5(?:.[0-9]+:[0-9]+.)?$', scp_f2) is not None: + with open(archive, "r") as f: + scp_f2 = f.readline().strip().split(" ")[1] + if re.match(r".*\.h5(?:.[0-9]+:[0-9]+.)?$", scp_f2) is not None: archive_type = ArchiveType.H5 - elif re.match(r'.*\.ark:.*$', scp_f2) is not None: + elif re.match(r".*\.ark:.*$", scp_f2) is not None: archive_type = ArchiveType.ARK - elif re.match(r'.*[cvg]:[0-9]+.[0-9]+:[0-9]+.$', scp_f2) is not None: + elif ( + re.match(r".*[cvg]:[0-9]+.[0-9]+:[0-9]+.$", scp_f2) is not None + ): archive_type = ArchiveType.AUDIO else: archive_type = ArchiveType.ARK - + # .split('[')[0].split(':') # if len(scp) == 1: # archive_type = ArchiveType.H5 # else: # archive_type = ArchiveType.ARK - + if archive_type == ArchiveType.ARK: for option in options: - if option == 'o': + if option == "o": once = True - elif option == 's': + elif option == "s": is_sorted = True - elif option == 'cs': + elif option == "cs": called_sorted = True - elif option == 'bg': + elif option == "bg": background = True - - return cls(spec_type, archive, archive_type, - once, is_sorted, called_sorted, - permissive, background) + + return cls( + spec_type, + archive, + archive_type, + once, + is_sorted, + called_sorted, + permissive, + background, + ) else: - raise ValueError('Two many fields (%d>2) in wspecifier %s' - % (len(fields), rspecifier)) - - + raise ValueError( + "Two many fields (%d>2) in wspecifier %s" % (len(fields), rspecifier) + ) def __eq__(self, other): eq = self.spec_type == other.spec_type @@ -420,6 +430,5 @@ def __eq__(self, other): eq = eq and self.background == other.background return eq - def __ne__(self, other): return not self.__eq__(other) diff --git a/hyperion/io/segment_vad_reader.py b/hyperion/io/segment_vad_reader.py index 563ac75a..df8d39e5 100644 --- a/hyperion/io/segment_vad_reader.py +++ b/hyperion/io/segment_vad_reader.py @@ -13,40 +13,47 @@ class SegmentVADReader(VADReader): - def __init__(self, segments_file, permissive=False): super().__init__(segments_file, permissive) self.segments = SegmentList.load(segments_file) - def read(self, keys, squeeze=False, offset=0, num_frames=0, - frame_length=25, frame_shift=10, snip_edges=False, - signal_lengths=None): + def read( + self, + keys, + squeeze=False, + offset=0, + num_frames=0, + frame_length=25, + frame_shift=10, + snip_edges=False, + signal_lengths=None, + ): if isinstance(keys, str): keys = [keys] offset_is_list, num_frames_is_list = self._assert_offsets_num_frames( - keys, offset, num_frames) + keys, offset, num_frames + ) vad = [] for i in range(len(keys)): df = self.segments[keys[i]] - ts = np.concatenate((df.tbeg[:,None], df.tend[:,None]), axis=1) + ts = np.concatenate((df.tbeg[:, None], df.tend[:, None]), axis=1) signal_length = None if signal_lengths is None else signal_lengths[i] vad_i = vad_timestamps_to_bin( - ts, frame_length/1000, frame_shift/1000, - snip_edges, signal_length) + ts, frame_length / 1000, frame_shift / 1000, snip_edges, signal_length + ) offset_i = offset[i] if offset_is_list else offset num_frames_i = num_frames[i] if num_frames_is_list else num_frames vad_i 
= self._get_bin_vad_slice(vad_i, offset_i, num_frames_i) vad.append(vad_i) - + if squeeze: DataReader._squeeze(vad, self.permissive) return vad - def read_timestamps(self, keys, merge_tol=0): if isinstance(keys, str): @@ -55,9 +62,7 @@ def read_timestamps(self, keys, merge_tol=0): ts = [] for i in range(len(keys)): df = self.segments[keys[i]] - ts_i = np.concatenate((df.tbeg[:,None], df.tend[:,None]), axis=1) + ts_i = np.concatenate((df.tbeg[:, None], df.tend[:, None]), axis=1) ts.append(ts_i) return ts - - diff --git a/hyperion/io/vad_reader.py b/hyperion/io/vad_reader.py index f2026470..c56a8ffe 100644 --- a/hyperion/io/vad_reader.py +++ b/hyperion/io/vad_reader.py @@ -11,33 +11,32 @@ class VADReader(object): """Abstract base class to read vad files. - - Attributes: - file_path: h5, ark or scp file to read. - permissive: If True, if the data that we want to read is not in the file - it returns an empty matrix, if False it raises an exception. - + + Attributes: + file_path: h5, ark or scp file to read. + permissive: If True, if the data that we want to read is not in the file + it returns an empty matrix, if False it raises an exception. + """ + def __init__(self, file_path, permissive=False): self.file_path = file_path self.permissive = permissive - def __enter__(self): """Function required when entering contructions of type - with VADReader('file.h5') as f: - keys, data = f.read() + with VADReader('file.h5') as f: + keys, data = f.read() """ return self - def __exit__(self, exc_type, exc_value, traceback): """Function required when exiting from contructions of type - with VADReader('file.h5') as f: - keys, data = f.read() + with VADReader('file.h5') as f: + keys, data = f.read() """ self.close() @@ -45,14 +44,13 @@ def close(self): """Closes input file.""" pass - @staticmethod def _assert_offsets_num_frames(keys, offset, num_frames): n = len(keys) - offset_is_list = (isinstance(offset, list) or - isinstance(offset, np.ndarray)) - num_frames_is_list = (isinstance(num_frames, list) or - isinstance(num_frames, np.ndarray)) + offset_is_list = isinstance(offset, list) or isinstance(offset, np.ndarray) + num_frames_is_list = isinstance(num_frames, list) or isinstance( + num_frames, np.ndarray + ) if offset_is_list: assert len(offset) == n @@ -61,7 +59,6 @@ def _assert_offsets_num_frames(keys, offset, num_frames): return offset_is_list, num_frames_is_list - @staticmethod def _get_bin_vad_slice(vad, offset, num_frames): if offset > 0: @@ -75,5 +72,5 @@ def _get_bin_vad_slice(vad, offset, num_frames): new_vad = np.zeros((num_frames,), dtype=np.bool) new_vad[:n] = vad vad = new_vad - + return vad diff --git a/hyperion/io/vad_rw_factory.py b/hyperion/io/vad_rw_factory.py index af4b0f29..7b855b07 100644 --- a/hyperion/io/vad_rw_factory.py +++ b/hyperion/io/vad_rw_factory.py @@ -11,54 +11,87 @@ class VADReaderFactory(object): - @staticmethod - def create(rspecifier, path_prefix=None, scp_sep=' ', - frame_length=25, frame_shift=10, snip_edges=False): + def create( + rspecifier, + path_prefix=None, + scp_sep=" ", + frame_length=25, + frame_shift=10, + snip_edges=False, + ): if isinstance(rspecifier, str): rspecifier = RSpecifier.create(rspecifier) logging.debug(rspecifier.__dict__) - if rspecifier.spec_type == RSpecType.ARCHIVE: - if (rspecifier.archive_type == ArchiveType.H5 or - rspecifier.archive_type == ArchiveType.ARK): - return BVR(rspecifier, path_prefix, scp_sep, - frame_length=frame_length, frame_shift=frame_shift, - snip_edges=snip_edges) + if rspecifier.spec_type == RSpecType.ARCHIVE: + if 
( + rspecifier.archive_type == ArchiveType.H5 + or rspecifier.archive_type == ArchiveType.ARK + ): + return BVR( + rspecifier, + path_prefix, + scp_sep, + frame_length=frame_length, + frame_shift=frame_shift, + snip_edges=snip_edges, + ) if rspecifier.archive_type == ArchiveType.SEGMENT_LIST: return SVR(rspecifier.archive, permissive=rspecifier.permissive) else: - if (rspecifier.archive_type == ArchiveType.H5 or - rspecifier.archive_type == ArchiveType.ARK): - return BVR(rspecifier, path_prefix, scp_sep, - frame_length=frame_length, frame_shift=frame_shift, - snip_edges=snip_edges) - + if ( + rspecifier.archive_type == ArchiveType.H5 + or rspecifier.archive_type == ArchiveType.ARK + ): + return BVR( + rspecifier, + path_prefix, + scp_sep, + frame_length=frame_length, + frame_shift=frame_shift, + snip_edges=snip_edges, + ) @staticmethod def filter_args(**kwargs): - valid_args = ('scp_sep', 'path_prefix', 'frame_shift', 'frame_length', 'snip_edges') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - - + valid_args = ( + "scp_sep", + "path_prefix", + "frame_shift", + "frame_length", + "snip_edges", + ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) + @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'scp-sep', default=' ', - help=('scp file field separator')) - parser.add_argument(p1+'path-prefix', default=None, - help=('scp file_path prefix')) - parser.add_argument(p1+'frame-shift', default=10, - help=('frame-shift used to compute binary VAD')) - parser.add_argument(p1+'frame-length', default=25, - help=('frame-length used to compute binary VAD')) - parser.add_argument(p1+'snip-edges', default=False, action='store_true', - help=('snip-edges was true when computing VAD')) + p1 = "--" + prefix + "." + parser.add_argument( + p1 + "scp-sep", default=" ", help=("scp file field separator") + ) + parser.add_argument( + p1 + "path-prefix", default=None, help=("scp file_path prefix") + ) + parser.add_argument( + p1 + "frame-shift", + default=10, + help=("frame-shift used to compute binary VAD"), + ) + parser.add_argument( + p1 + "frame-length", + default=25, + help=("frame-length used to compute binary VAD"), + ) + parser.add_argument( + p1 + "snip-edges", + default=False, + action="store_true", + help=("snip-edges was true when computing VAD"), + ) add_argparse_args = add_class_args diff --git a/hyperion/metrics/__init__.py b/hyperion/metrics/__init__.py index c074f3f9..6725621a 100644 --- a/hyperion/metrics/__init__.py +++ b/hyperion/metrics/__init__.py @@ -8,6 +8,3 @@ from .confusion_matrix import * from .eer import compute_eer, compute_prbep from .dcf import compute_dcf, compute_min_dcf, compute_act_dcf, fast_eval_dcf_eer - - - diff --git a/hyperion/metrics/acc.py b/hyperion/metrics/acc.py index 4e662055..daea183e 100644 --- a/hyperion/metrics/acc.py +++ b/hyperion/metrics/acc.py @@ -7,7 +7,6 @@ from sklearn.metrics import accuracy_score - def compute_accuracy(y_true, y_pred, normalize=True, sample_weight=None): """Computes accuracy @@ -16,7 +15,7 @@ def compute_accuracy(y_true, y_pred, normalize=True, sample_weight=None): Ground truth (correct) labels. y_pred: 1d array-like, or label indicator array / sparse matrix. Predicted labels, as returned by a classifier. - normalize: If False, return the number of correctly classified samples. + normalize: If False, return the number of correctly classified samples. 
Otherwise, return the fraction of correctly classified samples. sample_weight: Sample weights. @@ -24,7 +23,3 @@ def compute_accuracy(y_true, y_pred, normalize=True, sample_weight=None): Accuracy or number of correctly classified samples. """ return accuracy_score(y_true, y_pred, normalize, sample_weight) - - - - diff --git a/hyperion/metrics/cllr.py b/hyperion/metrics/cllr.py index 92ed6bf5..7018d283 100644 --- a/hyperion/metrics/cllr.py +++ b/hyperion/metrics/cllr.py @@ -8,25 +8,26 @@ from ..utils.math import neglogsigmoid from .utils import opt_loglr + def compute_cllr(tar, non): - """ CLLR: Measure of goodness of log-likelihood-ratio detection output. This measure ps both: + """CLLR: Measure of goodness of log-likelihood-ratio detection output. This measure ps both: - The quality of the score (over the whole DET curve), and - - The quality of the calibration + - The quality of the calibration Args: tar: Scores of target trials. non: Scores of non-target trials. - + Returns: CLLR """ - c1 = np.mean(neglogsigmoid(tar))/np.log(2) - c2 = np.mean(neglogsigmoid(non))/np.log(2) + c1 = np.mean(neglogsigmoid(tar)) / np.log(2) + c2 = np.mean(neglogsigmoid(non)) / np.log(2) - return (c1 + c2)/2 + return (c1 + c2) / 2 def compute_min_cllr(tar, non): - tar_llr, non_llr = opt_loglr(tar, non, 'raw') - return compute_cllr(tar_llr,non_llr) + tar_llr, non_llr = opt_loglr(tar, non, "raw") + return compute_cllr(tar_llr, non_llr) diff --git a/hyperion/metrics/confidence.py b/hyperion/metrics/confidence.py index 5ff636b6..708ad243 100644 --- a/hyperion/metrics/confidence.py +++ b/hyperion/metrics/confidence.py @@ -4,4 +4,3 @@ """ import numpy as np - diff --git a/hyperion/metrics/confusion_matrix.py b/hyperion/metrics/confusion_matrix.py index 7c1337fd..2efdd9e4 100644 --- a/hyperion/metrics/confusion_matrix.py +++ b/hyperion/metrics/confusion_matrix.py @@ -11,15 +11,16 @@ from ..utils.list_utils import list2ndarray -def compute_confusion_matrix(y_true, y_pred, labels=None, - normalize=True, sample_weight=None): +def compute_confusion_matrix( + y_true, y_pred, labels=None, normalize=True, sample_weight=None +): """Computes confusion matrix. Args: y_true: Ground truth. y_pred: Estimated labels. - labels: List of labels to index the matrix. This may be used to reorder - or select a subset of labels. If none is given, those that + labels: List of labels to index the matrix. This may be used to reorder + or select a subset of labels. If none is given, those that appear at least once in y_true or y_pred are used in sorted order. sample_weight: Sample weights. @@ -28,24 +29,29 @@ def compute_confusion_matrix(y_true, y_pred, labels=None, """ C = confusion_matrix(y_true, y_pred, labels=labels, sample_weight=sample_weight) if normalize: - C = C/(np.sum(C, axis=1, keepdims=True)+1e-10) + C = C / (np.sum(C, axis=1, keepdims=True) + 1e-10) return C - -def compute_xlabel_confusion_matrix(y_true, y_pred, labels_train=None, labels_test=None, - normalize=True, sample_weight=None): +def compute_xlabel_confusion_matrix( + y_true, + y_pred, + labels_train=None, + labels_test=None, + normalize=True, + sample_weight=None, +): """Computes confusion matrix when the labels used to train the classifier are different than those of the test set. Args: y_true: Ground truth. y_pred: Estimated labels. - labels_train: List of labels used to train the classifier. This may be used to reorder - or select a subset of labels. If none is given, those that + labels_train: List of labels used to train the classifier. 
This may be used to reorder + or select a subset of labels. If none is given, those that appear at least once in y_pred are used in sorted order. - labels_test: List of labels of the test set. This may be used to reorder - or select a subset of labels. If none is given, those that + labels_test: List of labels of the test set. This may be used to reorder + or select a subset of labels. If none is given, those that appear at least once in y_true are used in sorted order. sample_weight: Sample weights. @@ -64,28 +70,35 @@ def compute_xlabel_confusion_matrix(y_true, y_pred, labels_train=None, labels_te else: labels_test = list2ndarray(labels_test) - assert y_true.dtype == y_pred.dtype, 'y_true and y_pred labels does not have the same type' - assert labels_train.dtype == labels_test.dtype, 'Train and test labels does not have the same type' - assert labels_train.dtype == y_pred.dtype, 'Labels, y_true and y_pred does not have the same type' + assert ( + y_true.dtype == y_pred.dtype + ), "y_true and y_pred labels does not have the same type" + assert ( + labels_train.dtype == labels_test.dtype + ), "Train and test labels does not have the same type" + assert ( + labels_train.dtype == y_pred.dtype + ), "Labels, y_true and y_pred does not have the same type" num_classes_test = len(labels_test) - + if issubclass(y_true.dtype.type, np.integer): y_pred += num_classes_test - elif issubclass(y_true.dtype.type, np.dtype('U')) or issubclass( - y_true.dtype.type, np.dtype('S')): - y_true = np.asarray(['TEST_' + s for s in y_true]) - y_pred = np.asarray(['TRAIN_' + s for s in y_pred]) + elif issubclass(y_true.dtype.type, np.dtype("U")) or issubclass( + y_true.dtype.type, np.dtype("S") + ): + y_true = np.asarray(["TEST_" + s for s in y_true]) + y_pred = np.asarray(["TRAIN_" + s for s in y_pred]) else: raise Exception() - if issubclass(labels_train.dtype.type, np.integer): labels_train += num_classes_test - elif issubclass(labels_train.dtype.type, np.dtype('U')) or issubclass( - labels_train.dtype.type, np.dtype('S')): - labels_test = np.asarray(['TEST_' + s for s in labels_test]) - labels_train = np.asarray(['TRAIN_' + s for s in labels_train]) + elif issubclass(labels_train.dtype.type, np.dtype("U")) or issubclass( + labels_train.dtype.type, np.dtype("S") + ): + labels_test = np.asarray(["TEST_" + s for s in labels_test]) + labels_train = np.asarray(["TRAIN_" + s for s in labels_train]) else: raise Exception() @@ -93,14 +106,18 @@ def compute_xlabel_confusion_matrix(y_true, y_pred, labels_train=None, labels_te C = confusion_matrix(y_true, y_pred, labels=labels, sample_weight=sample_weight) C = C[:num_classes_test, num_classes_test:] if normalize: - C = C/np.sum(C, axis=1, keepdims=True) + C = C / np.sum(C, axis=1, keepdims=True) return C - - -def plot_confusion_matrix(C, labels_true, labels_pred=None, - title='Confusion matrix', cmap=plt.cm.Blues, fmt=None): +def plot_confusion_matrix( + C, + labels_true, + labels_pred=None, + title="Confusion matrix", + cmap=plt.cm.Blues, + fmt=None, +): """Plots a confusion matrix in a figure. 
Args: @@ -116,7 +133,7 @@ def plot_confusion_matrix(C, labels_true, labels_pred=None, assert C.shape[0] == len(labels_true) assert C.shape[1] == len(labels_pred) - plt.imshow(C, interpolation='nearest', cmap=cmap) + plt.imshow(C, interpolation="nearest", cmap=cmap) plt.title(title) plt.colorbar() tick_marks_y = np.arange(len(labels_true)) @@ -125,19 +142,22 @@ def plot_confusion_matrix(C, labels_true, labels_pred=None, plt.yticks(tick_marks_y, labels_true) if fmt is None: - normalized = np.all(C<=1) - fmt = '.2f' if normalized else 'd' - thresh = np.max(C) / 2. + normalized = np.all(C <= 1) + fmt = ".2f" if normalized else "d" + thresh = np.max(C) / 2.0 for i in range(C.shape[0]): for j in range(C.shape[1]): - plt.text(j, i, format(C[i, j], fmt), - horizontalalignment="center", - color="white" if C[i, j] > thresh else "black") - - plt.tight_layout() - plt.ylabel('True label') - plt.xlabel('Predicted label') + plt.text( + j, + i, + format(C[i, j], fmt), + horizontalalignment="center", + color="white" if C[i, j] > thresh else "black", + ) + plt.tight_layout() + plt.ylabel("True label") + plt.xlabel("Predicted label") def write_confusion_matrix(f, C, labels_true, labels_pred=None, fmt=None): @@ -149,7 +169,7 @@ def write_confusion_matrix(f, C, labels_true, labels_pred=None, fmt=None): labels_true: Labels of the true classes (rows). labels_cols: Labels of the predicted classes. If None, it is equal to labels_true. """ - + if labels_pred is None: labels_pred = labels_true @@ -157,24 +177,23 @@ def write_confusion_matrix(f, C, labels_true, labels_pred=None, fmt=None): assert C.shape[1] == len(labels_pred) if fmt is None: - normalized = np.all(C<=1) - fmt = '.2f' if normalized else 'd' + normalized = np.all(C <= 1) + fmt = ".2f" if normalized else "d" column_width = np.max([len(label) for label in labels_pred] + [6]) + 3 - empty_cell = ' ' * column_width + empty_cell = " " * column_width f.write(empty_cell) for label in labels_pred: - f.write('%{0}s'.format(column_width) % label) - f.write('\n') - + f.write("%{0}s".format(column_width) % label) + f.write("\n") + for i, label_y in enumerate(labels_true): - f.write('%{0}s'.format(column_width) % label_y) + f.write("%{0}s".format(column_width) % label_y) for j in range(C.shape[1]): - f.write('%{0}{1}'.format(column_width, fmt) % C[i, j]) - f.write('\n') - + f.write("%{0}{1}".format(column_width, fmt) % C[i, j]) + f.write("\n") + - def print_confusion_matrix(C, labels_true, labels_pred=None, fmt=None): """Prints confusion matrix to std output. @@ -184,6 +203,3 @@ def print_confusion_matrix(C, labels_true, labels_pred=None, fmt=None): labels_cols: Labels of the predicted classes. If None, it is equal to labels_true. 
""" write_confusion_matrix(sys.stdout, C, labels_true, labels_pred, fmt) - - - diff --git a/hyperion/metrics/dcf.py b/hyperion/metrics/dcf.py index 5451289f..db840201 100644 --- a/hyperion/metrics/dcf.py +++ b/hyperion/metrics/dcf.py @@ -7,6 +7,7 @@ from .roc import compute_rocch, rocch2eer + def compute_dcf(p_miss, p_fa, prior, normalize=True): """Computes detection cost function DCF = prior*p_miss + (1-prior)*p_fa @@ -24,15 +25,14 @@ def compute_dcf(p_miss, p_fa, prior, normalize=True): prior = np.asarray(prior) if prior.ndim == 1: - prior = prior[:,None] - - dcf = prior * p_miss + (1-prior) * p_fa + prior = prior[:, None] + + dcf = prior * p_miss + (1 - prior) * p_fa if normalize: - dcf /= np.minimum(prior, 1-prior) + dcf /= np.minimum(prior, 1 - prior) return dcf - def compute_min_dcf(tar, non, prior, normalize=True): """Computes minimum DCF min_DCF = min_t prior*p_miss(t) + (1-prior)*p_fa(t) @@ -53,19 +53,18 @@ def compute_min_dcf(tar, non, prior, normalize=True): p_miss, p_fa = compute_rocch(tar, non) dcf = compute_dcf(p_miss, p_fa, prior, normalize) idx_min_dcf = np.argmin(dcf, axis=-1) - if dcf.ndim==1: + if dcf.ndim == 1: min_dcf = dcf[idx_min_dcf] p_miss = p_miss[idx_min_dcf] p_fa = p_fa[idx_min_dcf] else: i1 = np.arange(dcf.shape[0]) - min_dcf = dcf[i1,idx_min_dcf] + min_dcf = dcf[i1, idx_min_dcf] p_miss = p_miss[idx_min_dcf] p_fa = p_fa[idx_min_dcf] return min_dcf, p_miss, p_fa - def compute_act_dcf(tar, non, prior, normalize=True): """Computes actual DCF by making decisions assuming that scores are calibrated to act as log-likelihood ratios. @@ -84,51 +83,52 @@ def compute_act_dcf(tar, non, prior, normalize=True): prior = np.asarray(prior) if prior.ndim == 1: - assert np.all(prior == np.sort(prior, kind='mergesort')), 'priors must be in ascending order' + assert np.all( + prior == np.sort(prior, kind="mergesort") + ), "priors must be in ascending order" else: prior = prior[None] - + num_priors = len(prior) ntar = len(tar) nnon = len(non) - - #thresholds - t = - np.log(prior) + np.log(1-prior) + + # thresholds + t = -np.log(prior) + np.log(1 - prior) ttar = np.concatenate((t, tar)) - ii = np.argsort(ttar, kind='mergesort') - r = np.zeros((num_priors + ntar), dtype='int32') + ii = np.argsort(ttar, kind="mergesort") + r = np.zeros((num_priors + ntar), dtype="int32") r[ii] = np.arange(1, num_priors + ntar + 1) r = r[:num_priors] n_miss = r - np.arange(num_priors, 0, -1) - tnon = np.concatenate((t, non)) - ii = np.argsort(tnon, kind='mergesort') - r = np.zeros((num_priors + nnon), dtype='int32') + ii = np.argsort(tnon, kind="mergesort") + r = np.zeros((num_priors + nnon), dtype="int32") r[ii] = np.arange(1, num_priors + nnon + 1) r = r[:num_priors] n_fa = nnon - r + np.arange(num_priors, 0, -1) # n_miss2 = np.zeros((num_priors,), dtype='int32') # n_fa2 = np.zeros((num_priors,), dtype='int32') - + # for i in range(len(t)): # n_miss2[i] = np.sum(tart[i]) # assert np.all(n_miss2 == n_miss) - # assert np.all(n_fa2 == n_fa) + # assert np.all(n_fa2 == n_fa) # print(n_miss) # print(n_fa) - p_miss = n_miss/ntar - p_fa = n_fa/nnon + p_miss = n_miss / ntar + p_fa = n_fa / nnon - act_dcf = prior * p_miss + (1-prior)*p_fa + act_dcf = prior * p_miss + (1 - prior) * p_fa if normalize: - act_dcf /= np.minimum(prior, 1-prior) + act_dcf /= np.minimum(prior, 1 - prior) if len(act_dcf) == 1: act_dcf = act_dcf[0] @@ -136,7 +136,6 @@ def compute_act_dcf(tar, non, prior, normalize=True): return act_dcf, p_miss, p_fa - def fast_eval_dcf_eer(tar, non, prior, normalize_dcf=True, return_probs=False): 
"""Computes actual DCF, minimum DCF, EER and PRBE all togther @@ -152,7 +151,7 @@ def fast_eval_dcf_eer(tar, non, prior, normalize_dcf=True, return_probs=False): EER value PREBP value """ - + p_miss, p_fa = compute_rocch(tar, non) eer = rocch2eer(p_miss, p_fa) @@ -172,8 +171,3 @@ def fast_eval_dcf_eer(tar, non, prior, normalize_dcf=True, return_probs=False): min_pmiss = p_miss[idx] min_pfa = p_fa[idx] return min_dcf, act_dcf, eer, prbep, min_pmiss, min_pfa, act_pmiss, act_pfa - - - - - diff --git a/hyperion/metrics/det_plot.py b/hyperion/metrics/det_plot.py index 16d50f65..708ad243 100644 --- a/hyperion/metrics/det_plot.py +++ b/hyperion/metrics/det_plot.py @@ -4,5 +4,3 @@ """ import numpy as np - - diff --git a/hyperion/metrics/eer.py b/hyperion/metrics/eer.py index 6103c4a8..afaed124 100644 --- a/hyperion/metrics/eer.py +++ b/hyperion/metrics/eer.py @@ -14,7 +14,7 @@ def compute_eer(tar, non): Args: tar: Scores of target trials. non: Scores of non-target trials. - + Returns: EER """ @@ -22,7 +22,6 @@ def compute_eer(tar, non): return rocch2eer(p_miss, p_fa) - def compute_prbep(tar, non): """Computes precission-recall break-even point where #FA == #Miss @@ -30,7 +29,7 @@ def compute_prbep(tar, non): Args: tar: Scores of target trials. non: Scores of non-target trials. - + Returns: PREBP value """ @@ -38,8 +37,3 @@ def compute_prbep(tar, non): N_miss = p_miss * len(tar) N_fa = p_fa * len(non) return rocch2eer(N_miss, N_fa) - - - - - diff --git a/hyperion/metrics/roc.py b/hyperion/metrics/roc.py index 82b8c782..38e4fa3c 100644 --- a/hyperion/metrics/roc.py +++ b/hyperion/metrics/roc.py @@ -10,8 +10,8 @@ from .utils import pavx -def compute_roc(true_scores, false_scores): - """ Computes the (observed) miss/false_alarm probabilities +def compute_roc(true_scores, false_scores): + """Computes the (observed) miss/false_alarm probabilities for a set of detection output scores. Args: @@ -23,37 +23,36 @@ def compute_roc(true_scores, false_scores): Returns: The miss/false_alarm error probabilities """ - num_true = len(true_scores); - num_false = len(false_scores); - assert(num_true>0) - assert(num_false>0) - - total=num_true+num_false; + num_true = len(true_scores) + num_false = len(false_scores) + assert num_true > 0 + assert num_false > 0 - p_miss = np.zeros((num_true+num_false+1,)); - p_fa = np.zeros((num_true+num_false+1,)); + total = num_true + num_false + + p_miss = np.zeros((num_true + num_false + 1,)) + p_fa = np.zeros((num_true + num_false + 1,)) scores = np.hstack((true_scores, false_scores)) labels = np.zeros_like(scores) labels[:num_true] = 1 - indx = np.argsort(scores, kind='mergesort') + indx = np.argsort(scores, kind="mergesort") labels = labels[indx] - - sumtrue=np.cumsum(labels); - sumfalse=num_false - (np.arange(total) + 1 - sumtrue); - p_miss[0] = 0; - p_fa[0] = 1.0; - p_miss[1:] = sumtrue/num_true; - p_fa[1:] = sumfalse/num_false; + sumtrue = np.cumsum(labels) + sumfalse = num_false - (np.arange(total) + 1 - sumtrue) - return p_miss, p_fa + p_miss[0] = 0 + p_fa[0] = 1.0 + p_miss[1:] = sumtrue / num_true + p_fa[1:] = sumfalse / num_false + return p_miss, p_fa def compute_rocch(tar_scores, non_scores): - """ Computes ROCCH: ROC Convex Hull. + """Computes ROCCH: ROC Convex Hull. Args: tar_scores: scores for target trials @@ -63,86 +62,85 @@ def compute_rocch(tar_scores, non_scores): pmiss and pfa contain the coordinates of the vertices of the ROC Convex Hull. 
""" - assert(isinstance(tar_scores, np.ndarray)) - assert(isinstance(non_scores, np.ndarray)) - + assert isinstance(tar_scores, np.ndarray) + assert isinstance(non_scores, np.ndarray) + Nt = len(tar_scores) Nn = len(non_scores) - N = Nt+Nn + N = Nt + Nn scores = np.hstack((tar_scores.ravel(), non_scores.ravel())) - #ideal, but non-monotonic posterior + # ideal, but non-monotonic posterior Pideal = np.hstack((np.ones((Nt,)), np.zeros((Nn,)))) - - #It is important here that scores that are the same (i.e. already in order) should NOT be swapped. - #MATLAB's sort algorithm has this property. - perturb = np.argsort(scores, kind='mergesort') - + + # It is important here that scores that are the same (i.e. already in order) should NOT be swapped. + # MATLAB's sort algorithm has this property. + perturb = np.argsort(scores, kind="mergesort") + Pideal = Pideal[perturb] - Popt, width, _ = pavx(Pideal) + Popt, width, _ = pavx(Pideal) nbins = len(width) - p_miss = np.zeros((nbins+1,)) - p_fa = np.zeros((nbins+1,)) + p_miss = np.zeros((nbins + 1,)) + p_fa = np.zeros((nbins + 1,)) - #threshold leftmost: accept eveything, miss nothing - #0 scores to left of threshold - left = 0 + # threshold leftmost: accept eveything, miss nothing + # 0 scores to left of threshold + left = 0 fa = Nn miss = 0 for i in range(nbins): - p_miss[i] = miss/Nt - p_fa[i] = fa/Nn + p_miss[i] = miss / Nt + p_fa[i] = fa / Nn left = left + width[i] miss = np.sum(Pideal[:left]) fa = N - left - np.sum(Pideal[left:]) - p_miss[nbins] = miss/Nt - p_fa[nbins] = fa/Nn + p_miss[nbins] = miss / Nt + p_fa[nbins] = fa / Nn return p_miss, p_fa def rocch2eer(p_miss, p_fa): """Calculates the equal error rate (eer) from pmiss and pfa - vectors. - Note: pmiss and pfa contain the coordinates of the vertices of the - ROC Convex Hull. - Use compute_rocch to convert target and non-target scores to pmiss and - pfa values. + vectors. + Note: pmiss and pfa contain the coordinates of the vertices of the + ROC Convex Hull. + Use compute_rocch to convert target and non-target scores to pmiss and + pfa values. """ eer = 0 - #p_miss and p_fa should be sorted - x = np.sort(p_miss, kind='mergesort') - assert(np.all(x == p_miss)) - x = np.sort(p_fa, kind='mergesort')[::-1] - assert(np.all(x == p_fa)) + # p_miss and p_fa should be sorted + x = np.sort(p_miss, kind="mergesort") + assert np.all(x == p_miss) + x = np.sort(p_fa, kind="mergesort")[::-1] + assert np.all(x == p_fa) _1_1 = np.array([1, -1]) _11 = np.array([[1], [1]]) - for i in range(len(p_fa)-1): - xx = p_fa[i:i+2] - yy = p_miss[i:i+2] - + for i in range(len(p_fa) - 1): + xx = p_fa[i : i + 2] + yy = p_miss[i : i + 2] + XY = np.vstack((xx, yy)).T dd = np.dot(_1_1, XY) - if np.min(np.abs(dd))==0: + if np.min(np.abs(dd)) == 0: eerseg = 0 else: - #find line coefficieents seg s.t. seg'[xx(i)yy(i)] = 1, - #when xx(i),yy(i) is on the line. - seg = sla.solve(XY,_11) - #candidate for EER, eer is highest candidate - eerseg = 1/(np.sum(seg)) - + # find line coefficieents seg s.t. seg'[xx(i)yy(i)] = 1, + # when xx(i),yy(i) is on the line. + seg = sla.solve(XY, _11) + # candidate for EER, eer is highest candidate + eerseg = 1 / (np.sum(seg)) + eer = np.maximum(eer, eerseg) - - return eer + return eer -def filter_roc(p_miss,p_fa): +def filter_roc(p_miss, p_fa): """Removes redundant points from the sequence of points (p_fa,p_miss) so that plotting an ROC or DET curve will be faster. The output ROC curve will be identical to the one plotted from the input @@ -157,23 +155,23 @@ def filter_roc(p_miss,p_fa): Hull. 
m for misses and fa for false alarms. Returns: new_p_miss, new_p_fa: Vectors containing selected values from the - input vectors. + input vectors. """ out = 0 new_p_miss = np.copy(p_miss) new_p_fa = np.copy(p_fa) - for i in range(1,len(p_miss)): + for i in range(1, len(p_miss)): if p_miss[i] == new_p_miss[out] or p_fa[i] == new_p_fa[out]: continue - + # save previous point, because it is the last point before the # change. On the next iteration, the current point will be saved. - out = out+1 - new_p_miss[out] = p_miss[i-1] - new_p_fa[out] = p_fa[i-1] + out = out + 1 + new_p_miss[out] = p_miss[i - 1] + new_p_fa[out] = p_fa[i - 1] - out = out+1 + out = out + 1 new_p_miss[out] = p_miss[-1] new_p_fa[out] = p_fa[-1] new_p_miss = new_p_miss[:out] @@ -182,10 +180,9 @@ def filter_roc(p_miss,p_fa): return new_p_miss, new_p_fa - def compute_area_under_rocch(p_miss, p_fa): """Calculates area under the ROC convex hull given p_miss, p_fa. - + Args: p_miss: Miss probabilities vector obtained from compute_rocch p_fa: False alarm probabilities vector @@ -194,86 +191,83 @@ def compute_area_under_rocch(p_miss, p_fa): AUC """ - assert np.all(p_miss == np.sort(p_miss, kind='mergesort')) - assert np.all(p_fa[::-1] == np.sort(p_fa, kind='mergesort')) + assert np.all(p_miss == np.sort(p_miss, kind="mergesort")) + assert np.all(p_fa[::-1] == np.sort(p_fa, kind="mergesort")) assert p_miss.shape == p_fa.shape - + auc = 0 for i in range(1, len(p_miss)): - auc += 0.5 * (p_miss[i] - p_miss[i-1]) * (p_fa[i] + p_fa[i+1]) + auc += 0.5 * (p_miss[i] - p_miss[i - 1]) * (p_fa[i] + p_fa[i + 1]) return auc - - def test_roc(): - + plt.figure() - plt.subplot(2,3,1) + plt.subplot(2, 3, 1) tar = np.array([1]) non = np.array([0]) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - h1, = plt.plot(pfa,pmiss,'r-^', label='ROCCH',linewidth=2) - h2, = plt.plot(pf,pm,'g--v', label='ROC',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + (h1,) = plt.plot(pfa, pmiss, "r-^", label="ROCCH", linewidth=2) + (h2,) = plt.plot(pf, pm, "g--v", label="ROC", linewidth=2) + plt.axis("square") plt.grid(True) plt.legend(handles=[h1, h2]) - plt.title('2 scores: non < tar') + plt.title("2 scores: non < tar") - plt.subplot(2,3,2) + plt.subplot(2, 3, 2) tar = np.array([0]) non = np.array([1]) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g--v',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g--v", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('2 scores: tar < non') - - plt.subplot(2,3,3) + plt.title("2 scores: tar < non") + + plt.subplot(2, 3, 3) tar = np.array([0]) - non = np.array([-1,1]) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g--v',linewidth=2) - plt.axis('square') + non = np.array([-1, 1]) + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g--v", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('3 scores: non < tar < non') - - plt.subplot(2,3,4) - tar = np.array([-1,1]) + plt.title("3 scores: non < tar < non") + + plt.subplot(2, 3, 4) + tar = np.array([-1, 1]) non = np.array([0]) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g--v',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = 
compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g--v", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('3 scores: tar < non < tar') - plt.xlabel(r'$P_{fa}$') - plt.ylabel(r'$P_{miss}') - - plt.subplot(2,3,5) - tar = np.random.randn(100)+1 + plt.title("3 scores: tar < non < tar") + plt.xlabel(r"$P_{fa}$") + plt.ylabel(r"$P_{miss}") + + plt.subplot(2, 3, 5) + tar = np.random.randn(100) + 1 non = np.random.randn(100) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('DET') - - plt.subplot(2,3,6) - tar = np.random.randn(100)*2+1 + plt.title("DET") + + plt.subplot(2, 3, 6) + tar = np.random.randn(100) * 2 + 1 non = np.random.randn(100) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('flatter DET') + plt.title("flatter DET") plt.show() - diff --git a/hyperion/metrics/utils.py b/hyperion/metrics/utils.py index 4677f3c5..8a764c3d 100644 --- a/hyperion/metrics/utils.py +++ b/hyperion/metrics/utils.py @@ -11,7 +11,7 @@ def effective_prior(p_tar, c_miss, c_fa): - """This function adjusts a given prior probability of target p_targ, + """This function adjusts a given prior probability of target p_targ, to incorporate the effects of a cost of miss, cmiss, and a cost of false-alarm, cfa. Args: @@ -22,44 +22,42 @@ def effective_prior(p_tar, c_miss, c_fa): Effective prior """ - beta = p_tar*c_miss/(1-p_tar)/c_fa - p_eff = beta/(1+beta) + beta = p_tar * c_miss / (1 - p_tar) / c_fa + p_eff = beta / (1 + beta) return p_eff - - def pavx(y): """PAV: Pool Adjacent Violators algorithm. Non-paramtetric optimization subject to monotonicity. - ghat = pav(y) - fits a vector ghat with nondecreasing components to the - data vector y such that sum((y - ghat).^2) is minimal. - (Pool-adjacent-violators algorithm). - - Author: This code is and adaptation from Bosaris Toolkit and - it is a simplified version of the 'IsoMeans.m' code made available - by Lutz Duembgen at: - http://www.imsv.unibe.ch/~duembgen/software - - Args: - y: uncalibrated scores - - Returns: - Calibrated scores - Width of pav bins, from left to right - (the number of bins is data dependent) - Height: corresponding heights of bins (in increasing order) - - """ + ghat = pav(y) + fits a vector ghat with nondecreasing components to the + data vector y such that sum((y - ghat).^2) is minimal. + (Pool-adjacent-violators algorithm). 
+ + Author: This code is and adaptation from Bosaris Toolkit and + it is a simplified version of the 'IsoMeans.m' code made available + by Lutz Duembgen at: + http://www.imsv.unibe.ch/~duembgen/software + + Args: + y: uncalibrated scores + + Returns: + Calibrated scores + Width of pav bins, from left to right + (the number of bins is data dependent) + Height: corresponding heights of bins (in increasing order) + + """ assert isinstance(y, np.ndarray) n = len(y) - assert n>0 + assert n > 0 index = np.zeros(y.shape, dtype=int) l = np.zeros(y.shape, dtype=int) - # An interval of indices is represented by its left endpoint - # ("index") and its length "len" + # An interval of indices is represented by its left endpoint + # ("index") and its length "len" ghat = np.zeros_like(y) ci = 0 @@ -70,22 +68,20 @@ def pavx(y): # ghat[ci] is the mean of y-values within this interval. for j in range(1, n): # a new index intervall, {j}, is created: - ci = ci+1 + ci = ci + 1 index[ci] = j l[ci] = 1 ghat[ci] = y[j] - #while ci >= 1 and ghat[np.maximum(ci-1,0)] >= ghat[ci]: - while ci >= 1 and ghat[ci-1] >= ghat[ci]: + # while ci >= 1 and ghat[np.maximum(ci-1,0)] >= ghat[ci]: + while ci >= 1 and ghat[ci - 1] >= ghat[ci]: # "pool adjacent violators": - nw = l[ci-1] + l[ci] - ghat[ci-1] = ghat[ci-1] + (l[ci] / nw) * ( - ghat[ci] - ghat[ci-1]) - l[ci-1] = nw - ci = ci-1 + nw = l[ci - 1] + l[ci] + ghat[ci - 1] = ghat[ci - 1] + (l[ci] / nw) * (ghat[ci] - ghat[ci - 1]) + l[ci - 1] = nw + ci = ci - 1 - - height = np.copy(ghat[:ci+1]) - width = l[:ci+1] + height = np.copy(ghat[: ci + 1]) + width = l[: ci + 1] # Now define ghat for all indices: while n >= 1: @@ -93,15 +89,14 @@ def pavx(y): ghat[j] = ghat[ci] n = index[ci] - ci = ci-1 + ci = ci - 1 return ghat, width, height +def opt_loglr(tar, non, method="laplace"): + """Non-parametric optimization of score to log-likelihood-ratio mapping. -def opt_loglr(tar, non, method='laplace'): - """Non-parametric optimization of score to log-likelihood-ratio mapping. - Taken from Bosaris toolkit. Niko Brummer and Johan du Preez, Application-Independent Evaluation of Speaker Detection, Computer Speech and Language, 2005 @@ -109,46 +104,46 @@ def opt_loglr(tar, non, method='laplace'): tar: target scores. non: non-target scores. method: laplace(default, avoids inf log-LR)/raw - + Returns: Calibrated tar and non-tar log-LR """ ntar = len(tar) nnon = len(non) - n = ntar+nnon + n = ntar + nnon scores = np.concatenate((tar, non)) p_ideal = np.zeros((n,), dtype=float_cpu()) p_ideal[:ntar] = 1 - sort_idx = np.argsort(scores, kind='mergesort') + sort_idx = np.argsort(scores, kind="mergesort") # print(scores) # print(sort_idx) p_ideal = p_ideal[sort_idx] - if method == 'laplace': - # The extra targets and non-targets at scores of -inf and +inf effectively - # implement Laplace's rule of succession to avoid log LRs of infinite magnitudes. - p_ideal = np.concatenate(([1,0], p_ideal, [1,0])) + if method == "laplace": + # The extra targets and non-targets at scores of -inf and +inf effectively + # implement Laplace's rule of succession to avoid log LRs of infinite magnitudes. + p_ideal = np.concatenate(([1, 0], p_ideal, [1, 0])) - p_opt,_,_ = pavx(p_ideal) + p_opt, _, _ = pavx(p_ideal) - if method == 'laplace': + if method == "laplace": p_opt = p_opt[2:-2] # Posterior to loglr - # This LR is prior-independent in the sense that if we weight the data with a synthetic prior, - # it makes no difference to the optimizing LR mapping. 
- # (A synthetic prior DOES change Popt: The posterior log-odds changes by an additive term. But this + # This LR is prior-independent in the sense that if we weight the data with a synthetic prior, + # it makes no difference to the optimizing LR mapping. + # (A synthetic prior DOES change Popt: The posterior log-odds changes by an additive term. But this # this cancels again when converting to log LR. ) # print(p_opt) - post_log_odds = np.log(p_opt) - np.log(1-p_opt) - prior_log_odds = np.log(ntar/nnon) + post_log_odds = np.log(p_opt) - np.log(1 - p_opt) + prior_log_odds = np.log(ntar / nnon) llr = post_log_odds - prior_log_odds - llr += 1e-6 * np.arange(n)/n + llr += 1e-6 * np.arange(n) / n llr[sort_idx] = llr tar_llr = llr[:ntar] non_llr = llr[ntar:] - + return tar_llr, non_llr diff --git a/hyperion/metrics/verification_evaluator.py b/hyperion/metrics/verification_evaluator.py index 7f7811d3..d2b26ed6 100644 --- a/hyperion/metrics/verification_evaluator.py +++ b/hyperion/metrics/verification_evaluator.py @@ -11,9 +11,10 @@ import copy import matplotlib -matplotlib.use('Agg') -matplotlib.rc('font',**{'family':'sans-serif','sans-serif':['Helvetica']}) -matplotlib.rc('text', usetex=True) + +matplotlib.use("Agg") +matplotlib.rc("font", **{"family": "sans-serif", "sans-serif": ["Helvetica"]}) +matplotlib.rc("text", usetex=True) import matplotlib.pyplot as plt from ..hyp_defs import float_cpu @@ -22,6 +23,7 @@ from .utils import effective_prior from .dcf import fast_eval_dcf_eer + class VerificationEvaluator(object): """Class computes performance metrics for verification problems. Same metrics can be obtained from fast_eval_dcf_eer functions @@ -32,22 +34,23 @@ class VerificationEvaluator(object): p_tar: target prior float or list/nparray sorted in ascending order c_miss: cost of miss c_fa: cost of false alarm - + """ + def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): if isinstance(key, str): - logging.info('Load key: %s' % key) + logging.info("Load key: %s" % key) key = TrialKey.load(key) if isinstance(scores, str): - logging.info('Load scores: %s' % scores) + logging.info("Load scores: %s" % scores) scores = TrialScores.load(scores) self.key = key self.scores = scores.align_with_ndx(key) - #compute effective prior is c_miss and c_fa are given + # compute effective prior is c_miss and c_fa are given if isinstance(p_tar, float): p_tar = [p_tar] @@ -59,20 +62,19 @@ def __init__(self, key, scores, p_tar, c_miss=None, c_fa=None): self.p_tar = p_tar - def compute_dcf_eer(self, return_df=False): """ Computes DCF/EER - + Args: return_df: if True, it returns the result in a pandas DataFrame object. 
Returns: min_dcf, act_dcf, eer tuple or pandas DataFrame """ - logging.info('separating tar/non') + logging.info("separating tar/non") tar, non = self.scores.get_tar_non(self.key) - logging.info('computing EER/DCF') + logging.info("computing EER/DCF") min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(tar, non, self.p_tar) if not return_df: @@ -83,60 +85,64 @@ def compute_dcf_eer(self, return_df=False): min_dcf = [min_dcf] act_dcf = [act_dcf] - df = pd.DataFrame({'eer': eer}) + df = pd.DataFrame({"eer": eer}) for i in range(len(min_dcf)): pi = self.p_tar[i] - df['min-dcf-%.3f' % (pi)] = min_dcf[i] - df['act-dcf-%.3f' % (pi)] = act_dcf[i] + df["min-dcf-%.3f" % (pi)] = min_dcf[i] + df["act-dcf-%.3f" % (pi)] = act_dcf[i] return df - - class VerificationAdvAttackEvaluator(VerificationEvaluator): - """Class computes performance metrics for verification problems + """Class computes performance metrics for verification problems under adversarial attacks Attributes: key: TrialKey object or file_name. scores: TrialScores object or file_name for the system without attack - attack_scores: TrialScores list or file_name list for the system under attack, + attack_scores: TrialScores list or file_name list for the system under attack, each element of the list are the scores for a different attack for the same trial list. For example, FGSM attacks for several eps values. - attack_stats: Pandas DataFrame or file_name list, one for each element in attack scores. + attack_stats: Pandas DataFrame or file_name list, one for each element in attack scores. The dataframe contain statistics about the adversarial signals like snr, linf, etc. p_tar: target prior float or list/nparray sorted in ascending order c_miss: cost of miss c_fa: cost of false alarm - + """ - def __init__(self, key, scores, attack_scores, attack_stats, - p_tar, c_miss=None, c_fa=None): + + def __init__( + self, key, scores, attack_scores, attack_stats, p_tar, c_miss=None, c_fa=None + ): super(VerificationAdvAttackEvaluator, self).__init__( - key, scores, p_tar, c_miss, c_fa) + key, scores, p_tar, c_miss, c_fa + ) if not isinstance(attack_scores, list): attack_scores = [attack_scores] if not isinstance(attack_stats, list): attack_stats = [attack_stats] - assert len(attack_scores) == len(attack_stats), ( - 'num_attack_scores({}) != num_attack_stats({})'.format( - len(attack_scores), len(attack_stats))) + assert len(attack_scores) == len( + attack_stats + ), "num_attack_scores({}) != num_attack_stats({})".format( + len(attack_scores), len(attack_stats) + ) if isinstance(attack_scores[0], str): l = [] for file_path in attack_scores: - logging.info('Load attack scores: %s' % file_path) + logging.info("Load attack scores: %s" % file_path) scores = TrialScores.load(file_path) l.append(scores) attack_scores = l - #align attack scores to key + # align attack scores to key attack_scores_mat = np.zeros( - (len(attack_scores), self.key.num_models, self.key.num_tests), - dtype=float_cpu()) + (len(attack_scores), self.key.num_models, self.key.num_tests), + dtype=float_cpu(), + ) for i, s in enumerate(attack_scores): s = s.align_with_ndx(self.key) @@ -145,7 +151,7 @@ def __init__(self, key, scores, attack_scores, attack_stats, if isinstance(attack_stats[0], str): l = [] for file_path in attack_stats: - logging.info('Load attack stats: %s' % file_path) + logging.info("Load attack stats: %s" % file_path) scores = TrialStats.load(file_path) l.append(scores) attack_stats = l @@ -156,16 +162,14 @@ def __init__(self, key, scores, attack_scores, attack_stats, 
self._last_stat_name = None self._last_stats_mat = None - @property def num_attacks(self): return self.attack_scores.shape[0] - @staticmethod def _sort_stats_bins(stat_bins, higher_better): - """Sorts the statistics from best to worst, - e.g., for snr higher is better, + """Sorts the statistics from best to worst, + e.g., for snr higher is better, for perturbation linf, lower is better Args: @@ -180,14 +184,13 @@ def _sort_stats_bins(stat_bins, higher_better): stat_bins = stat_bins[::-1] return stat_bins - def _get_stats_mat(self, stat_name): - """Gets the statistics (SNR, Linf) in matrix format aligned with + """Gets the statistics (SNR, Linf) in matrix format aligned with the score matrix. Args: stat_name: name of the statistic matching the column name in pandas DataFrame. - + Returns: Stats matrix aligned with TrialKey """ @@ -195,30 +198,35 @@ def _get_stats_mat(self, stat_name): return self._last_stats_mat stats_mat = np.zeros( - (self.num_attacks, self.key.num_models, self.key.num_tests), - dtype=float_cpu()) + (self.num_attacks, self.key.num_models, self.key.num_tests), + dtype=float_cpu(), + ) for i in range(self.num_attacks): stats_mat[i] = self.attack_stats[i].get_stats_mat(stat_name, self.key) - self.attack_stats[i].reset_stats_mats() # release some mem - + self.attack_stats[i].reset_stats_mats() # release some mem + self._last_stat_name = stat_name self._last_stats_mat = stats_mat return self._last_stats_mat - - def compute_dcf_eer_vs_stats(self, stat_name, stat_bins, - attacked_trials='all', higher_better=False, - return_df=False): + def compute_dcf_eer_vs_stats( + self, + stat_name, + stat_bins, + attacked_trials="all", + higher_better=False, + return_df=False, + ): """ Computes DCF/EER versus SNR/Linf/etc curves - + Args: stat_name: stat name for x-axis matching pandas DataFrame column name. stat_bins: bins to sweep to generate the curve attacked_trials: str in ['all', 'tar', 'non'] indicating if we want curves where we attack all trials, only targets or only nontargets - higher_better: Indicates if the stat_name (x-axis) is better if is high. + higher_better: Indicates if the stat_name (x-axis) is better if is high. True for SNR, false for Linf,L2,... return_df: if True, it returns the result in a pandas DataFrame object. 
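For reference while reading compute_dcf_eer and compute_dcf_eer_vs_stats, here is a small illustrative sketch (not part of the patch) of the underlying metrics on synthetic scores. It assumes the hyperion package is importable; the scores are random and not taken from any experiment.

import numpy as np
from hyperion.metrics import compute_eer, fast_eval_dcf_eer

rng = np.random.RandomState(1234)
tar = rng.randn(2000) + 2.0  # synthetic target-trial scores
non = rng.randn(2000)        # synthetic non-target-trial scores

eer = compute_eer(tar, non)
# With p_tar=0.05: min_dcf picks the best threshold on the ROC convex hull,
# while act_dcf thresholds the raw scores at -logit(0.05), so it also
# reflects calibration quality.
min_dcf, act_dcf, _, _ = fast_eval_dcf_eer(tar, non, 0.05)
print("EER=%.4f min_dcf=%.4f act_dcf=%.4f" % (eer, min_dcf, act_dcf))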
@@ -229,9 +237,9 @@ def compute_dcf_eer_vs_stats(self, stat_name, stat_bins, # sort stats bins from best to worse stat_bins = self._sort_stats_bins(stat_bins, higher_better) - if attacked_trials == 'all': + if attacked_trials == "all": mask = np.logical_or(self.key.tar, self.key.non) - elif attacked_trials == 'tar': + elif attacked_trials == "tar": mask = self.key.tar else: mask = self.key.non @@ -245,35 +253,34 @@ def compute_dcf_eer_vs_stats(self, stat_name, stat_bins, act_dcf = np.zeros((num_bins, len(self.p_tar)), dtype=float_cpu()) if higher_better: - cmp_func = lambda x,y: np.logical_and(np.greater_equal(x,y), mask) + cmp_func = lambda x, y: np.logical_and(np.greater_equal(x, y), mask) sort_func = lambda x: np.argmin(x) else: - cmp_func = lambda x,y: np.logical_and(np.less_equal(x,y), mask) + cmp_func = lambda x, y: np.logical_and(np.less_equal(x, y), mask) sort_func = lambda x: np.argmax(x) scores_attack = copy.deepcopy(self.scores) - print(np.max(stats_mat, axis=(1,2))) + print(np.max(stats_mat, axis=(1, 2))) for b in range(num_bins): # we initialize the score matrix with non-attack scores scores = copy.copy(self.scores.scores) - #find attack scores that meet the bin criteria + # find attack scores that meet the bin criteria score_mask = cmp_func(stats_mat, stat_bins[b]) - print(b,np.sum(score_mask, axis=(1,2))) - + print(b, np.sum(score_mask, axis=(1, 2))) + if self.num_attacks == 1: scores[score_mask[0]] = self.attack_scores[score_mask] else: for i in range(scores.shape[0]): for j in range(scores.shape[1]): - mask_ij = score_mask[:,i,j] + mask_ij = score_mask[:, i, j] if np.any(mask_ij): k = sort_func(stats_mat[mask_ij, i, j]) - scores[i,j] = self.attack_scores[k, i, j] + scores[i, j] = self.attack_scores[k, i, j] scores_attack.scores = scores tar, non = scores_attack.get_tar_non(self.key) - min_dcf_b, act_dcf_b, eer_b, _ = fast_eval_dcf_eer( - tar, non, self.p_tar) + min_dcf_b, act_dcf_b, eer_b, _ = fast_eval_dcf_eer(tar, non, self.p_tar) eer[b] = eer_b min_dcf[b] = min_dcf_b act_dcf[b] = act_dcf_b @@ -281,25 +288,31 @@ def compute_dcf_eer_vs_stats(self, stat_name, stat_bins, if not return_df: return stat_bins, min_dcf, act_dcf, eer - df = pd.DataFrame({stat_name: stat_bins, - 'eer': eer}) + df = pd.DataFrame({stat_name: stat_bins, "eer": eer}) for i in range(min_dcf.shape[1]): pi = self.p_tar[i] - df['min-dcf-%.3f' % (pi)] = min_dcf[:,i] - df['act-dcf-%.3f' % (pi)] = act_dcf[:,i] + df["min-dcf-%.3f" % (pi)] = min_dcf[:, i] + df["act-dcf-%.3f" % (pi)] = act_dcf[:, i] return df - - def find_best_attacks(self, stat_name, attacked_trials, - num_best=10, min_delta=1, attack_idx=0, - threshold=None, prior_idx=0, higher_better=False, - return_df=False): + def find_best_attacks( + self, + stat_name, + attacked_trials, + num_best=10, + min_delta=1, + attack_idx=0, + threshold=None, + prior_idx=0, + higher_better=False, + return_df=False, + ): """ Find the best attacks from the point of view of some of the stats. E.g., Attacks with best SNR or with lowest Linf. - + Args: stat_name: stat name for x-axis matching pandas DataFrame column name. attacked_trials: str in ['all', 'tar', 'non'] indicating if we want curves where @@ -309,7 +322,7 @@ def find_best_attacks(self, stat_name, attacked_trials, score>threshold+min_delta for non-targets threshold: decision threshold, if None, it uses -logit(p_tar) prior_idx: indicates whichi of the priors in the p_tar array to use to compute threshold. - higher_better: Indicates if the stat_name (x-axis) is better if is high. 
+ higher_better: Indicates if the stat_name (x-axis) is better if is high. True for SNR, false for Linf,L2,... return_df: if True, it returns the result in a pandas DataFrame object. @@ -317,24 +330,28 @@ def find_best_attacks(self, stat_name, attacked_trials, modelid, test_segmentid, original_scores, adversarial_scores, stat_values arrays or pandas DataFrame """ - + if threshold is None: prior = self.p_tar[prior_idx] - threshold = -np.log(prior/(1-prior)) - + threshold = -np.log(prior / (1 - prior)) + scores = self.scores.scores attack_scores = self.attack_scores[attack_idx] - if attacked_trials == 'tar': + if attacked_trials == "tar": success_mask = np.logical_and( - np.logical_and(self.key.tar, scores>threshold), - np.logical_and(attack_scores min_delta)) + np.logical_and(self.key.tar, scores > threshold), + np.logical_and( + attack_scores < threshold, scores - attack_scores > min_delta + ), + ) else: success_mask = np.logical_and( - np.logical_and(self.key.non, scoresthreshold, - attack_scores-scores > min_delta)) - + np.logical_and(self.key.non, scores < threshold), + np.logical_and( + attack_scores > threshold, attack_scores - scores > min_delta + ), + ) + if not np.any(success_mask): return None @@ -342,15 +359,13 @@ def find_best_attacks(self, stat_name, attacked_trials, sorted_stats = np.sort(stats_mat[success_mask]) if higher_better: sorted_stats = sorted_stats[::-1] - + num_best = min(len(sorted_stats), num_best) - stats_threshold = sorted_stats[num_best-1] + stats_threshold = sorted_stats[num_best - 1] if higher_better: - success_mask = np.logical_and( - success_mask, stats_mat >= stats_threshold) + success_mask = np.logical_and(success_mask, stats_mat >= stats_threshold) else: - success_mask = np.logical_and( - success_mask, stats_mat <= stats_threshold) + success_mask = np.logical_and(success_mask, stats_mat <= stats_threshold) rmodelid = [] rsegmentid = [] @@ -359,35 +374,47 @@ def find_best_attacks(self, stat_name, attacked_trials, rstat = np.zeros((num_best,), dtype=float_cpu()) k = 0 nz = success_mask.nonzero() - for i,j in zip(nz[0], nz[1]): + for i, j in zip(nz[0], nz[1]): rmodelid.append(self.key.model_set[i]) rsegmentid.append(self.key.seg_set[j]) - rscores[k] = scores[i,j] - rascores[k] = attack_scores[i,j] - rstat[k] = stats_mat[i,j] + rscores[k] = scores[i, j] + rascores[k] = attack_scores[i, j] + rstat[k] = stats_mat[i, j] k += 1 if k == num_best: - break + break if not return_df: return rmodelid, rsegmentid, rscores, rascores, rstat print(rmodelid, rsegmentid, rscores, rascores, rstat) - df = pd.DataFrame({'modelid': rmodelid, - 'segmentid': rsegmentid, - 'scores': rscores, - 'attack-scores': rascores, - stat_name: rstat}) + df = pd.DataFrame( + { + "modelid": rmodelid, + "segmentid": rsegmentid, + "scores": rscores, + "attack-scores": rascores, + stat_name: rstat, + } + ) return df - - def save_best_attacks(self, file_path, stat_name, attacked_trials, - num_best=10, min_delta=1, attack_idx=0, - threshold=None, prior_idx=0, higher_better=False): + def save_best_attacks( + self, + file_path, + stat_name, + attacked_trials, + num_best=10, + min_delta=1, + attack_idx=0, + threshold=None, + prior_idx=0, + higher_better=False, + ): """ Find the best attacks from the point of view of some of the stats. E.g., Attacks with best SNR or with lowest Linf and saves to csv file - + Args: stat_name: stat name for x-axis matching pandas DataFrame column name. 
attacked_trials: str in ['all', 'tar', 'non'] indicating if we want curves where @@ -397,54 +424,71 @@ def save_best_attacks(self, file_path, stat_name, attacked_trials, score>threshold+min_delta for non-targets threshold: decision threshold, if None, it uses -logit(p_tar) prior_idx: indicates whichi of the priors in the p_tar array to use to compute threshold. - higher_better: Indicates if the stat_name (x-axis) is better if is high. + higher_better: Indicates if the stat_name (x-axis) is better if is high. True for SNR, false for Linf,L2,... """ df = self.find_best_attacks( - stat_name, attacked_trials, num_best, min_delta, attack_idx, - threshold, prior_idx, higher_better, return_df=True) + stat_name, + attacked_trials, + num_best, + min_delta, + attack_idx, + threshold, + prior_idx, + higher_better, + return_df=True, + ) if df is None: return df.to_csv(file_path) - @staticmethod def _process_perf_name(name): - m=re.match(r'eer', name) - if m is not None: - return 0, 'EER(\%)' + m = re.match(r"eer", name) + if m is not None: + return 0, "EER(\%)" - m = re.match(r'min-dcf', name) + m = re.match(r"min-dcf", name) if m is not None: - last=m.span()[1] - if len(name[last:])==0: - return 1, 'MinDCF' + last = m.span()[1] + if len(name[last:]) == 0: + return 1, "MinDCF" else: - p=float(name[last+1:]) - return 1, 'MinDCF(p=%.3f)' % (p) + p = float(name[last + 1 :]) + return 1, "MinDCF(p=%.3f)" % (p) - - m = re.match(r'act-dcf', name) + m = re.match(r"act-dcf", name) if m is not None: - last=m.span()[1] - if len(name[last:])==0: - return 2, 'ActDCF' + last = m.span()[1] + if len(name[last:]) == 0: + return 2, "ActDCF" else: - p=float(name[last+1:]) - return 2, 'ActDCF(p=%.3f)' % (p) - - + p = float(name[last + 1 :]) + return 2, "ActDCF(p=%.3f)" % (p) @staticmethod def plot_dcf_eer_vs_stat_v1( - df, stat_name, output_path, - eer_max=50., min_dcf_max=1., act_dcf_max=1., log_x=False, - clean_ref=None, file_format='pdf', xlabel='', higher_better=False, - legends=None, title=None, fmt=['b','r','g','m','c','y'], - legend_loc='upper left', - legend_font='medium', font_size=10, colors=None): + df, + stat_name, + output_path, + eer_max=50.0, + min_dcf_max=1.0, + act_dcf_max=1.0, + log_x=False, + clean_ref=None, + file_format="pdf", + xlabel="", + higher_better=False, + legends=None, + title=None, + fmt=["b", "r", "g", "m", "c", "y"], + legend_loc="upper left", + legend_font="medium", + font_size=10, + colors=None, + ): """Plot EER/MinDCF/ActDCF versus stat (SNR, Linf) with matplotlib and save figs to file. 
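        A minimal usage sketch (hypothetical names: it assumes `evaluator` is a
        VerificationAdvAttackEvaluator whose attack stats were registered under
        the column name "snr", and that the output directory already exists):

            df = evaluator.compute_dcf_eer_vs_stats(
                stat_name="snr", stat_bins=[30, 20, 10, 0],
                attacked_trials="all", higher_better=True, return_df=True)
            VerificationAdvAttackEvaluator.plot_dcf_eer_vs_stat_v1(
                df, "snr", "exp/figs/attack_snr",
                xlabel="SNR (dB)", higher_better=True)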
Args: @@ -455,7 +499,7 @@ def plot_dcf_eer_vs_stat_v1( min_dcf: y axis maximum value for MinDCF act_dcf: y axis maximum value for ActDCF log_x: if True x-axis is plot in log scale - clean_ref: row number containing the values for non-attack result, + clean_ref: row number containing the values for non-attack result, if None, if won't plot the non-attack result file_format: format of the mage file default: pdf xlabel: label for x-axis @@ -468,9 +512,9 @@ def plot_dcf_eer_vs_stat_v1( font_size: global font size colors: colors in string format """ - matplotlib.rc('font', size=font_size) - matplotlib.rc('legend', fontsize=legend_font) - matplotlib.rc('legend', loc=legend_loc) + matplotlib.rc("font", size=font_size) + matplotlib.rc("legend", fontsize=legend_font) + matplotlib.rc("legend", loc=legend_loc) if not isinstance(df, list): df = [df] @@ -478,41 +522,46 @@ def plot_dcf_eer_vs_stat_v1( columns = [c for c in df[0].columns if c != stat_name] ylim = [eer_max, min_dcf_max, act_dcf_max] x = df[0][stat_name].values - #remove infs - noinf = (x != np.inf) + # remove infs + noinf = x != np.inf x = x[noinf] if log_x: - x[x==0] = 0.01 + x[x == 0] = 0.01 for c in columns: - file_path = '%s_%s.%s' % (output_path, c, file_format) + file_path = "%s_%s.%s" % (output_path, c, file_format) t, ylabel = VerificationAdvAttackEvaluator._process_perf_name(c) plt.figure() for i in range(len(df)): y = df[i][c].values - if clean_ref is not None and i==0: + if clean_ref is not None and i == 0: y_clean = y[clean_ref] - if t==0: + if t == 0: y_clean *= 100 - label = None if legends is None else 'original' - plt.hlines(y_clean, np.min(x), np.max(x), - color='k', linestyles='dashed', - linewidth=1.5, label=label) + label = None if legends is None else "original" + plt.hlines( + y_clean, + np.min(x), + np.max(x), + color="k", + linestyles="dashed", + linewidth=1.5, + label=label, + ) y = y[noinf] - if t==0: + if t == 0: y *= 100 label = None if legends is None else legends[i] plt.plot(x, y, fmt[i], linewidth=1.5, label=label) - if log_x: - plt.xscale('log') + plt.xscale("log") if higher_better: - plt.xlim(np.max(x), max(0.1,np.min(x))) + plt.xlim(np.max(x), max(0.1, np.min(x))) else: - plt.xlim(max(0.1,np.min(x)), np.max(x)) + plt.xlim(max(0.1, np.min(x)), np.max(x)) else: if higher_better: plt.xlim(np.max(x), np.min(x)) @@ -522,26 +571,37 @@ def plot_dcf_eer_vs_stat_v1( plt.ylim(0, ylim[t]) plt.ylabel(ylabel) plt.legend() - plt.xlabel('%s perturb. budget.' % (xlabel)) - #plt.xlabel('$L_{\infty}$ perturb. budget.') + plt.xlabel("%s perturb. budget." % (xlabel)) + # plt.xlabel('$L_{\infty}$ perturb. budget.') plt.grid(True) if title is not None: plt.title(title) - #plt.show() + # plt.show() plt.tight_layout() plt.savefig(file_path) plt.clf() plt.close() - @staticmethod def plot_dcf_eer_vs_stat_v2( - df, stat_name, output_path, - eer_max=50., dcf_max=1., log_x=False, - clean_ref=None, file_format='pdf', xlabel='', higher_better=False, - legends=None, title=None, fmt=['b','r','g','m','c','y'], - legend_loc='upper left', - legend_font='medium', font_size=10, colors=None): + df, + stat_name, + output_path, + eer_max=50.0, + dcf_max=1.0, + log_x=False, + clean_ref=None, + file_format="pdf", + xlabel="", + higher_better=False, + legends=None, + title=None, + fmt=["b", "r", "g", "m", "c", "y"], + legend_loc="upper left", + legend_font="medium", + font_size=10, + colors=None, + ): """Plot EER/MinDCF/ActDCF versus stat (SNR, Linf) with matplotlib and save figs to file. 
In this version minimum and actual DCF are plotted in the same figure. @@ -552,7 +612,7 @@ def plot_dcf_eer_vs_stat_v2( eer_max: y axis maximum value for EER min_dcf: y axis maximum value for DCF log_x: if True x-axis is plot in log scale - clean_ref: row number containing the values for non-attack result, + clean_ref: row number containing the values for non-attack result, if None, if won't plot the non-attack result file_format: format of the mage file default: pdf xlabel: label for x-axis @@ -566,24 +626,29 @@ def plot_dcf_eer_vs_stat_v2( colors: colors in string format """ - matplotlib.rc('font', size=font_size) - matplotlib.rc('legend', fontsize=legend_font) - matplotlib.rc('legend', loc=legend_loc) + matplotlib.rc("font", size=font_size) + matplotlib.rc("legend", fontsize=legend_font) + matplotlib.rc("legend", loc=legend_loc) if not isinstance(df, list): df = [df] - columns = [c for c in df[0].columns if ( - c != stat_name and - VerificationAdvAttackEvaluator._process_perf_name(c)[0]!=2)] + columns = [ + c + for c in df[0].columns + if ( + c != stat_name + and VerificationAdvAttackEvaluator._process_perf_name(c)[0] != 2 + ) + ] ylim = [eer_max, dcf_max, dcf_max] x = df[0][stat_name].values - #remove infs - noinf = (x != np.inf) + # remove infs + noinf = x != np.inf x = x[noinf] if log_x: - x[x==0] = 0.01 + x[x == 0] = 0.01 for c in columns: @@ -591,54 +656,90 @@ def plot_dcf_eer_vs_stat_v2( plt.figure() if t == 0: columns2 = [c] - file_path = '%s_%s.%s' % (output_path, c, file_format) + file_path = "%s_%s.%s" % (output_path, c, file_format) else: - columns2 = [re.sub('min-dcf', 'act-dcf', c), c] - ylabel = re.sub('Min', '', ylabel) - file_path = '%s_%s.%s' % ( - output_path, re.sub('min-dcf', 'dcf', c), file_format) + columns2 = [re.sub("min-dcf", "act-dcf", c), c] + ylabel = re.sub("Min", "", ylabel) + file_path = "%s_%s.%s" % ( + output_path, + re.sub("min-dcf", "dcf", c), + file_format, + ) for k in range(len(columns2)): cc = columns2[k] for i in range(len(df)): y = df[i][cc].values - if clean_ref is not None and i==0: + if clean_ref is not None and i == 0: y_clean = y[clean_ref] - if t==0: + if t == 0: y_clean *= 100 - if k==0: - label = None if legends is None else 'original' - plt.hlines(y_clean, np.min(x), np.max(x), - color='k', linestyles='solid', - linewidth=1.5, label=label) + if k == 0: + label = None if legends is None else "original" + plt.hlines( + y_clean, + np.min(x), + np.max(x), + color="k", + linestyles="solid", + linewidth=1.5, + label=label, + ) else: - plt.hlines(y_clean, np.min(x), np.max(x), - color='k', linestyles='dashed', - linewidth=1.5) + plt.hlines( + y_clean, + np.min(x), + np.max(x), + color="k", + linestyles="dashed", + linewidth=1.5, + ) y = y[noinf] - if t==0: + if t == 0: y *= 100 - if k==0: + if k == 0: label = None if legends is None else legends[i] if colors is None: - plt.plot(x, y, fmt[i], linestyle='solid', linewidth=1.5, label=label) + plt.plot( + x, + y, + fmt[i], + linestyle="solid", + linewidth=1.5, + label=label, + ) else: - plt.plot(x, y, fmt[i], linestyle='solid', linewidth=1.5, label=label, color=colors[i]) + plt.plot( + x, + y, + fmt[i], + linestyle="solid", + linewidth=1.5, + label=label, + color=colors[i], + ) else: if colors is None: - plt.plot(x, y, fmt[i], linestyle='dashed', linewidth=1.5) + plt.plot(x, y, fmt[i], linestyle="dashed", linewidth=1.5) else: - plt.plot(x, y, fmt[i], linestyle='dashed', linewidth=1.5, color=colors[i]) + plt.plot( + x, + y, + fmt[i], + linestyle="dashed", + linewidth=1.5, + color=colors[i], + ) if 
log_x: - plt.xscale('log') + plt.xscale("log") if higher_better: - plt.xlim(np.max(x), max(0.1,np.min(x))) + plt.xlim(np.max(x), max(0.1, np.min(x))) else: - plt.xlim(max(0.1,np.min(x)), np.max(x)) + plt.xlim(max(0.1, np.min(x)), np.max(x)) else: if higher_better: plt.xlim(np.max(x), np.min(x)) @@ -649,15 +750,13 @@ def plot_dcf_eer_vs_stat_v2( plt.ylabel(ylabel) if legends is not None: plt.legend() - plt.xlabel('%s perturb. budget.' % (xlabel)) - #plt.xlabel('$L_{\infty}$ perturb. budget.') + plt.xlabel("%s perturb. budget." % (xlabel)) + # plt.xlabel('$L_{\infty}$ perturb. budget.') plt.grid(True) if title is not None: plt.title(title) plt.tight_layout() - #plt.show() + # plt.show() plt.savefig(file_path) plt.clf() plt.close() - - diff --git a/hyperion/model_loader.py b/hyperion/model_loader.py index f2a1f41b..30780d7b 100644 --- a/hyperion/model_loader.py +++ b/hyperion/model_loader.py @@ -9,31 +9,28 @@ class ModelLoader(object): - @staticmethod def get_object(): - obj_dict={ 'DiagNormal': DiagNormal, - 'Normal': Normal, - 'DiagGMM': DiagGMM, - 'GMM': GMM, - 'FRPLDA': FRPLDA, - 'SPLDA': SPLDA, - 'CentWhiten': CentWhiten, - 'LNorm': LNorm, - 'PCA': PCA, - 'LDA': LDA, - 'NAP': NAP, - 'SbSw': SbSw, - 'MVN': MVN, - 'TransformList': TransformList} + obj_dict = { + "DiagNormal": DiagNormal, + "Normal": Normal, + "DiagGMM": DiagGMM, + "GMM": GMM, + "FRPLDA": FRPLDA, + "SPLDA": SPLDA, + "CentWhiten": CentWhiten, + "LNorm": LNorm, + "PCA": PCA, + "LDA": LDA, + "NAP": NAP, + "SbSw": SbSw, + "MVN": MVN, + "TransformList": TransformList, + } return obj_dict - - + @staticmethod def load(file_path): - class_name = HypModel.load_config(file_path)['class_name'] + class_name = HypModel.load_config(file_path)["class_name"] class_obj = ModelLoader.get_object()[class_name] return class_obj.load(file_path) - - - diff --git a/hyperion/pdfs/__init__.py b/hyperion/pdfs/__init__.py index f831c88d..91af5497 100644 --- a/hyperion/pdfs/__init__.py +++ b/hyperion/pdfs/__init__.py @@ -8,7 +8,3 @@ from .plda import * from .jfa import * from .hmm import * - - - - diff --git a/hyperion/pdfs/core/__init__.py b/hyperion/pdfs/core/__init__.py index 3af5a750..2defe6d4 100644 --- a/hyperion/pdfs/core/__init__.py +++ b/hyperion/pdfs/core/__init__.py @@ -8,6 +8,3 @@ from .exp_family import ExpFamily from .normal_diag_cov import NormalDiagCov, DiagNormal from .normal import Normal - - - diff --git a/hyperion/pdfs/core/exp_family.py b/hyperion/pdfs/core/exp_family.py index 774fd267..44fc172c 100644 --- a/hyperion/pdfs/core/exp_family.py +++ b/hyperion/pdfs/core/exp_family.py @@ -8,16 +8,15 @@ from abc import ABCMeta, abstractmethod from .pdf import PDF + class ExpFamily(PDF): __metaclass__ = ABCMeta - + def __init__(self, eta=None, **kwargs): super(ExpFamily, self).__init__(**kwargs) self.eta = eta self.A = None - - @property def is_init(self): if not self._is_init: @@ -27,53 +26,40 @@ def is_init(self): self._is_init = True return self._is_init - - - def fit(self, x, sample_weight=None, - x_val=None, sample_weight_val=None, batch_size=None): + def fit( + self, x, sample_weight=None, x_val=None, sample_weight_val=None, batch_size=None + ): - N, u_x =self.Estep(x=x, sample_weight=sample_weight, - batch_size=batch_size) + N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size) self.Mstep(N, u_x) - elbo=self.elbo(x, N=N, u_x=u_x) - elbo = [elbo, elbo/N] + elbo = self.elbo(x, N=N, u_x=u_x) + elbo = [elbo, elbo / N] - if x_val is not None: - N, u_x = self.Estep(x=x_val, sample_weight=sample_weight_val, - 
batch_size=batch_size) + N, u_x = self.Estep( + x=x_val, sample_weight=sample_weight_val, batch_size=batch_size + ) elbo_val = self.elbo(x_val, N=N, u_x=u_x) - elbo += [elbo_val, elbo_val/N] + elbo += [elbo_val, elbo_val / N] return elbo - - def log_h(self, x): return 0 - - def accum_log_h(self, x, sample_weight=None): if sample_weight is None: return np.sum(self.log_h(x)) return np.sum(sample_weight * self.log_h(x)) - - def compute_suff_stats(self, x): return x - - - def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): if u_x is not None or batch_size is None: return self._accum_suff_stats_1batch(x, u_x, sample_weight) else: return self._accum_suff_stats_nbatches(x, sample_weight, batch_size) - - def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): if u_x is None: u_x = self.compute_suff_stats(x) @@ -85,13 +71,11 @@ def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): acc_u_x = np.sum(u_x, axis=0) return N, acc_u_x - - def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): sw_i = None for i1 in range(0, x.shape[0], batch_size): - i2 = np.minimum(i1+batch_size, x.shape[0]) - x_i = x[i1:i2,:] + i2 = np.minimum(i1 + batch_size, x.shape[0]) + x_i = x[i1:i2, :] if sample_weight is not None: sw_i = sample_weight[i1:i2] N_i, u_x_i = self._accum_suff_stats_1batch(x_i, sample_weight=sw_i) @@ -103,91 +87,71 @@ def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): u_x += u_x_i return N, u_x - - def add_suff_stats(self, N, u_x): - assert(len(N)==len(u_x)) + assert len(N) == len(u_x) acc_N = N[1] acc_u_x = u_x[1] - for i in range(1,len(N)): + for i in range(1, len(N)): acc_N += N acc_u_x += u[i] return acc_N, acc_u_x - - def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - - @abstractmethod def Mstep(self, stats): pass - - def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None): assert self.is_init if u_x is None: - N, u_x = self.accum_suff_stats(x, sample_weight=sample_weight, - batch_size=batch_size) + N, u_x = self.accum_suff_stats( + x, sample_weight=sample_weight, batch_size=batch_size + ) if log_h is None: log_h = self.accum_log_h(x, sample_weight=sample_weight) - return log_h + np.inner(u_x, self.eta) - N*self.A + return log_h + np.inner(u_x, self.eta) - N * self.A - - - def log_prob(self, x, u_x=None, method='nat'): - if method == 'nat': + def log_prob(self, x, u_x=None, method="nat"): + if method == "nat": return self.log_prob_nat(x, u_x) else: return self.log_prob_std(x) - - def log_prob_nat(self, x, u_x = None): + def log_prob_nat(self, x, u_x=None): assert self.is_init if u_x is None: u_x = self.compute_suff_stats(x) return self.log_h(x) + np.inner(u_x, self.eta) - self.A - - @staticmethod def compute_A_nat(eta): raise NotImplementedError() - @staticmethod def compute_A_std(params): raise NotImplementedError() - @staticmethod def compute_eta(param): raise NotImplementedError() - @staticmethod def compute_std(eta): raise NotImplementedError() - @abstractmethod def _compute_nat_params(self): pass - @abstractmethod def _compute_std_params(self): pass - def _compute_nat_std(self): pass - + @abstractmethod def validate(self): pass - - diff --git a/hyperion/pdfs/core/normal.py b/hyperion/pdfs/core/normal.py index e8e6d8f2..b1ff4224 100644 --- a/hyperion/pdfs/core/normal.py +++ b/hyperion/pdfs/core/normal.py @@ -9,16 +9,34 @@ from scipy.special import erf from ...hyp_defs import float_cpu -from 
...utils.plotting import plot_gaussian_1D, plot_gaussian_ellipsoid_2D, plot_gaussian_ellipsoid_3D, plot_gaussian_3D -from ...utils.math import invert_pdmat, invert_trimat, symmat2vec, vec2symmat, fullcov_varfloor, logdet_pdmat +from ...utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, + plot_gaussian_3D, +) +from ...utils.math import ( + invert_pdmat, + invert_trimat, + symmat2vec, + vec2symmat, + fullcov_varfloor, + logdet_pdmat, +) from .exp_family import ExpFamily class Normal(ExpFamily): - - def __init__(self, mu=None, Lambda=None, var_floor=1e-5, - update_mu=True, update_Lambda=True, **kwargs): + def __init__( + self, + mu=None, + Lambda=None, + var_floor=1e-5, + update_mu=True, + update_Lambda=True, + **kwargs + ): super(Normal, self).__init__(**kwargs) self.mu = mu self.Lambda = Lambda @@ -27,13 +45,11 @@ def __init__(self, mu=None, Lambda=None, var_floor=1e-5, self.update_Lambda = update_Lambda self._compute_nat_std() - + self._logLambda = None self._cholLambda = None self._Sigma = None - - def _compute_nat_std(self): if self.mu is not None and self.Lambda is not None: self._validate_mu() @@ -44,7 +60,6 @@ def _compute_nat_std(self): self.A = self.compute_A_nat(self.eta) self._compute_std_params() - @property def logLambda(self): if self._logLambda is None: @@ -54,8 +69,6 @@ def logLambda(self): self._cholLambda = L.T return self._logLambda - - @property def cholLambda(self): if self._cholLambda is None: @@ -65,8 +78,6 @@ def cholLambda(self): self._cholLambda = L.T return self._cholLambda - - @property def Sigma(self): if self._Sigma is None: @@ -74,28 +85,20 @@ def Sigma(self): self._Sigma = invert_pdmat(self.Lambda, return_inv=True)[-1] return self._Sigma - - def initialize(self): self.validate() self._compute_nat_std() - - def stack_suff_stats(self, F, S=None): if S is None: return F - return np.hstack((F,S)) - + return np.hstack((F, S)) - def unstack_suff_stats(self, stats): - F=stats[:self.x_dim] - S=stats[self.x_dim:] + F = stats[: self.x_dim] + S = stats[self.x_dim :] return F, S - - def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): if u_x is None: if sample_weight is None: @@ -104,59 +107,54 @@ def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): S = symmat2vec(np.dot(x.T, x)) else: N = np.sum(sample_weight) - wx = sample_weight[:, None]*x + wx = sample_weight[:, None] * x F = np.sum(wx, axis=0) S = symmat2vec(np.dot(wx.T, x)) return N, self.stack_suff_stats(F, S) else: return self._accum_suff_stats_1batch(x, u_x, sample_weight) - - def norm_suff_stats(self, N, u_x, return_order2=False): assert self.is_init - + F, S = self.unstack_suff_stats(u_x) - F_norm = np.dot(F-N*self.mu, self.cholLambda.T) + F_norm = np.dot(F - N * self.mu, self.cholLambda.T) if return_order2: SS = vec2symat(S) Fmu = np.outer(self.F, self.mu) - SS = SS-Fmu-Fmu.T+N*np.outer(self.mu,self.mu) + SS = SS - Fmu - Fmu.T + N * np.outer(self.mu, self.mu) SS = np.dot(self.cholLambda, np.dot(SS, self.cholLambda.T)) S = symmat2vec(SS) return N, self.stack_suff_stats(F_norm, S) return N, F_norm - - def Mstep(self, N, u_x): F, S = self.unstack_suff_stats(u_x) if self.update_mu: - self.mu = F/N + self.mu = F / N if self.update_Lambda: - S = vec2symmat(S/N) - S -= np.outer(self.mu,self.mu) + S = vec2symmat(S / N) + S -= np.outer(self.mu, self.mu) # S = fullcov_varfloor(S, self.var_floor) self.Lambda = invert_pdmat(S, return_inv=True)[-1] self._Sigma = None self._logLambda = None self._cholLambda = None - 
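        # The natural parameters (eta and the log-normalizer A) are recomputed
        # below from the updated mu/Lambda, so that log_prob_nat and elbo use
        # the M-step estimates.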
self._compute_nat_params() - - def log_prob_std(self, x): assert self.is_init - mah_dist2 = np.sum(np.dot(x-self.mu,self.cholLambda)**2, axis=1) - return 0.5*self.logLambda-0.5*self.x_dim*np.log(2*np.pi)-0.5*mah_dist2 + mah_dist2 = np.sum(np.dot(x - self.mu, self.cholLambda) ** 2, axis=1) + return ( + 0.5 * self.logLambda + - 0.5 * self.x_dim * np.log(2 * np.pi) + - 0.5 * mah_dist2 + ) - - # def eval_logcdf(self, x): # delta = np.dot((x-self.mu), self.cholLambda) # lk = 0.5*(1+erf(delta/np.sqrt(2))) @@ -167,127 +165,106 @@ def log_prob_std(self, x): # print(lk) # return np.sum(np.log(lk+1e-20), axis=-1) - - def sample(self, num_samples, rng=None, seed=1024): assert self.is_init - + if rng is None: rng = np.random.RandomState(seed) - return rng.multivariate_normal(self.mu, self.Sigma,size=(num_samples,)).astype(float_cpu()) + return rng.multivariate_normal(self.mu, self.Sigma, size=(num_samples,)).astype( + float_cpu() + ) # x=rng.normal(size=(num_samples, self.x_dim)) # cholS=la.cholesky(self.Sigma, lower=False, overwrite_a=True) # return self.mu+np.dot(x, cholS) - - def get_config(self): - config = {'var_floor': self.var_floor, - 'update_mu': self.update_mu, - 'update_lambda': self.update_Lambda } + config = { + "var_floor": self.var_floor, + "update_mu": self.update_mu, + "update_lambda": self.update_Lambda, + } base_config = super(Normal, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - + assert self.is_init - - params = {'mu': self.mu, - 'Lambda': self.Lambda} - self._save_params_from_dict(f, params) + params = {"mu": self.mu, "Lambda": self.Lambda} + self._save_params_from_dict(f, params) - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'Lambda'] - params = self._load_params_to_dict(f, config['name'], param_list) - return cls(x_dim=config['x_dim'], - mu=params['mu'], Lambda=params['Lambda'], - var_floor=config['var_floor'], - update_mu=config['update_mu'], - update_Lambda=config['update_lambda'], - name=config['name']) - + param_list = ["mu", "Lambda"] + params = self._load_params_to_dict(f, config["name"], param_list) + return cls( + x_dim=config["x_dim"], + mu=params["mu"], + Lambda=params["Lambda"], + var_floor=config["var_floor"], + update_mu=config["update_mu"], + update_Lambda=config["update_lambda"], + name=config["name"], + ) def _validate_mu(self): - assert(self.mu.shape[0] == self.x_dim) - - + assert self.mu.shape[0] == self.x_dim def _validate_Lambda(self): - assert(self.Lambda.shape == (self.x_dim, self.x_dim)) - - + assert self.Lambda.shape == (self.x_dim, self.x_dim) def _validate_eta(self): - assert(self.eta.shape[0] == (self.x_dim**2+3*self.x_dim)/2) - + assert self.eta.shape[0] == (self.x_dim ** 2 + 3 * self.x_dim) / 2 - def validate(self): if self.mu is not None and self.Lambda is not None: self._validate_mu() self._validate_Lambda() - + if self.eta is not None: self._validate_eta() - - @staticmethod def compute_eta(mu, Lambda): Lmu = np.dot(mu, Lambda) eta = np.hstack((Lmu, -symmat2vec(Lambda, diag_factor=0.5))) return eta - - @staticmethod def compute_x_dim_from_eta(eta): - x_dim = 0.5*(-3+np.sqrt(9+8*eta.shape[-1])) - assert(int(x_dim)==x_dim) + x_dim = 0.5 * (-3 + np.sqrt(9 + 8 * eta.shape[-1])) + assert int(x_dim) == x_dim return int(x_dim) - - @staticmethod def compute_std(eta): x_dim = Normal.compute_x_dim_from_eta(eta) eta1 = eta[:x_dim] - eta2 = vec2symmat(eta[x_dim:], diag_factor=2)/2 - Lambda = - 2*eta2 + eta2 = vec2symmat(eta[x_dim:], diag_factor=2) / 2 + Lambda = -2 * 
eta2 f = invert_pdmat(-eta2, right_inv=True)[0] mu = 0.5 * f(eta1) return mu, Lambda - - @staticmethod def compute_A_nat(eta): x_dim = Normal.compute_x_dim_from_eta(eta) eta1 = eta[:x_dim] - eta2 = vec2symmat(eta[x_dim:], diag_factor=2)/2 + eta2 = vec2symmat(eta[x_dim:], diag_factor=2) / 2 f, _, log_minus_eta2 = invert_pdmat(-eta2, right_inv=True, return_logdet=True) - r1 = 0.5*x_dim*np.log(2*np.pi) - r2 = 0.25*np.inner(f(eta1), eta1) - r3 = - 0.5*x_dim*np.log(2) - 0.5*log_minus_eta2 + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = 0.25 * np.inner(f(eta1), eta1) + r3 = -0.5 * x_dim * np.log(2) - 0.5 * log_minus_eta2 return r1 + r2 + r3 - - @staticmethod def compute_A_std(mu, Lambda): x_dim = mu.shape[0] - r1 = 0.5*x_dim*np.log(2*np.pi) - r2 = -0.5*logdet_pdmat(Lambda) - r3 = 0.5*np.inner(np.dot(mu, Lambda), mu) + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = -0.5 * logdet_pdmat(Lambda) + r3 = 0.5 * np.inner(np.dot(mu, Lambda), mu) return r1 + r2 + r3 - - def _compute_nat_params(self): self.eta = self.compute_eta(self.mu, self.Lambda) self.A = self.compute_A_std(self.mu, self.Lambda) @@ -299,65 +276,47 @@ def _compute_nat_params(self): # Lambda[np.diag_indices(self.x_dim)] /= 2 # self.eta=np.vstack((lnr, Lmu, symmat2vec(Lambda)[:, None])) - - def _compute_std_params(self): self.mu, self.Lambda = self.compute_std(self.eta) self._cholLambda = None self._logLambda = None self._Sigma = None - - @staticmethod def compute_suff_stats(x): - d=x.shape[1] - u=np.zeros((x.shape[0], int(d+d*(d+1)/2)), dtype=float_cpu()) - u[:,:d]=x - k=d + d = x.shape[1] + u = np.zeros((x.shape[0], int(d + d * (d + 1) / 2)), dtype=float_cpu()) + u[:, :d] = x + k = d for i in range(d): for j in range(i, d): - u[:,k]=x[:,i]*x[:,j] - k+=1 + u[:, k] = x[:, i] * x[:, j] + k += 1 return u - - def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): assert self.is_init - mu=self.mu[feat_idx] - C=invert_pdmat(self.Lambda, return_inv=True)[-1][feat_idx, feat_idx] + mu = self.mu[feat_idx] + C = invert_pdmat(self.Lambda, return_inv=True)[-1][feat_idx, feat_idx] plot_gaussian_1D(mu, C, num_sigmas, num_pts, **kwargs) - - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): assert self.is_init - mu=self.mu[feat_idx] + mu = self.mu[feat_idx] j, i = np.meshgrid(feat_idx, feat_idx) - C=invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] + C = invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] plot_gaussian_ellipsoid_2D(mu, C, num_sigmas, num_pts, **kwargs) - - def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): assert self.is_init - mu=self.mu[feat_idx] + mu = self.mu[feat_idx] j, i = np.meshgrid(feat_idx, feat_idx) - C=invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] + C = invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] plot_gaussian_3D(mu, C, num_sigmas, num_pts, **kwargs) - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): assert self.is_init - mu=self.mu[feat_idx] + mu = self.mu[feat_idx] j, i = np.meshgrid(feat_idx, feat_idx) - C=invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] + C = invert_pdmat(self.Lambda, return_inv=True)[-1][i, j] plot_gaussian_ellipsoid_3D(mu, C, num_sigmas, num_pts, **kwargs) - - - - - - diff --git a/hyperion/pdfs/core/normal_diag_cov.py b/hyperion/pdfs/core/normal_diag_cov.py index e80112dd..562d3899 100644 --- a/hyperion/pdfs/core/normal_diag_cov.py +++ b/hyperion/pdfs/core/normal_diag_cov.py @@ -11,15 +11,26 @@ # import matplotlib.mlab as mlab from ...hyp_defs import float_cpu -from ...utils.plotting import 
plot_gaussian_1D, plot_gaussian_ellipsoid_2D, plot_gaussian_ellipsoid_3D, plot_gaussian_3D +from ...utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, + plot_gaussian_3D, +) from .exp_family import ExpFamily class NormalDiagCov(ExpFamily): - - def __init__(self, mu=None, Lambda=None, var_floor=1e-5, - update_mu=True, update_Lambda=True, **kwargs): + def __init__( + self, + mu=None, + Lambda=None, + var_floor=1e-5, + update_mu=True, + update_Lambda=True, + **kwargs + ): super(NormalDiagCov, self).__init__(**kwargs) self.mu = mu self.Lambda = Lambda @@ -33,8 +44,6 @@ def __init__(self, mu=None, Lambda=None, var_floor=1e-5, self._cholLambda = None self._Sigma = None - - def _compute_nat_std(self): if self.mu is not None and self.Lambda is not None: self._validate_mu() @@ -45,8 +54,6 @@ def _compute_nat_std(self): self.A = self.compute_A_nat(self.eta) self._compute_std_params() - - @property def logLambda(self): if self._logLambda is None: @@ -54,196 +61,163 @@ def logLambda(self): self._logLambda = np.sum(np.log(self.Lambda)) return self._logLambda - - @property def cholLambda(self): if self._cholLambda is None: assert self.is_init self._cholLambda = np.sqrt(self.Lambda) return self._cholLambda - - @property def Sigma(self): if self._Sigma is None: assert self.is_init - self._Sigma = 1./self.Lambda + self._Sigma = 1.0 / self.Lambda return self._Sigma - - def initialize(self): self.validate() self._compute_nat_std() assert self.is_init - - def stack_suff_stats(self, F, S=None): if S is None: return F - return np.hstack((F,S)) - + return np.hstack((F, S)) - def unstack_suff_stats(self, stats): - F=stats[:self.x_dim] - S=stats[self.x_dim:] + F = stats[: self.x_dim] + S = stats[self.x_dim :] return F, S - - def norm_suff_stats(self, N, u_x=None, return_order2=False): assert self.is_init F, S = self.unstack_suff_stats(u_x) - F_norm = self.cholLambda*(F-N*self.mu) + F_norm = self.cholLambda * (F - N * self.mu) if return_order2: - S = S-2*self.mu*F+N*self.mu**2 - S *= self.Lambda + S = S - 2 * self.mu * F + N * self.mu ** 2 + S *= self.Lambda return N, self.stack_suff_stats(F_norm, S) return N, F_norm - - def Mstep(self, N, u_x): F, S = self.unstack_suff_stats(u_x) if self.update_mu: - self.mu = F/N + self.mu = F / N if self.update_Lambda: - S = S/N-self.mu**2 - S[S 0)) + assert self.Lambda.shape[0] == self.x_dim + assert np.all(self.Lambda > 0) - - def _validate_eta(self): - assert(self.eta.shape[0] == self.x_dim*2) - - + assert self.eta.shape[0] == self.x_dim * 2 def validate(self): if self.mu is not None and self.Lambda is not None: self._validate_mu() self._validate_Lambda() - + if self.eta is not None: self._validate_eta() - - @staticmethod def compute_eta(mu, Lambda): - Lmu = Lambda*mu - eta = np.hstack((Lmu, -0.5*Lambda)) + Lmu = Lambda * mu + eta = np.hstack((Lmu, -0.5 * Lambda)) return eta - - @staticmethod def compute_std(eta): - x_dim = int(eta.shape[0]/2) + x_dim = int(eta.shape[0] / 2) eta1 = eta[:x_dim] eta2 = eta[x_dim:] - mu = -0.5*eta1/eta2 - Lambda = -2*eta2 + mu = -0.5 * eta1 / eta2 + Lambda = -2 * eta2 return mu, Lambda - - @staticmethod def compute_A_nat(eta): - x_dim = int(eta.shape[0]/2) + x_dim = int(eta.shape[0] / 2) eta1 = eta[:x_dim] eta2 = eta[x_dim:] - r1 = 0.5 * x_dim*np.log(2*np.pi) - r2 = -1/4 * np.sum(eta1*eta1/eta2) - r3 = -1/2 * np.sum(np.log(-2*eta2)) + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = -1 / 4 * np.sum(eta1 * eta1 / eta2) + r3 = -1 / 2 * np.sum(np.log(-2 * eta2)) return r1 + r2 + r3 - - @staticmethod 
def compute_A_std(mu, Lambda): x_dim = mu.shape[0] - r1 = 0.5*x_dim*np.log(2*np.pi) - r2 = -0.5*np.sum(np.log(Lambda)) - r3 = 0.5*np.sum(mu*mu*Lambda) + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = -0.5 * np.sum(np.log(Lambda)) + r3 = 0.5 * np.sum(mu * mu * Lambda) return r1 + r2 + r3 - def _compute_nat_params(self): self.eta = self.compute_eta(self.mu, self.Lambda) self.A = self.compute_A_nat(self.eta) @@ -252,48 +226,39 @@ def _compute_nat_params(self): # lnr = 0.5*self.lnLambda - 0.5*self.x_dim*np.log(2*np.pi)-0.5*muLmu # self.eta=np.hstack((lnr, Lmu, -0.5*self.Lambda)).T - def _compute_std_params(self): self.mu, self.Lambda = self.compute_std(self.eta) self._cholLambda = None self._logLambda = None self._Sigma = None - - + @staticmethod def compute_suff_stats(x): d = x.shape[1] - u = np.zeros((x.shape[0],2*d), dtype=float_cpu()) - u[:,:d] = x - u[:,d:] = x*x + u = np.zeros((x.shape[0], 2 * d), dtype=float_cpu()) + u[:, :d] = x + u[:, d:] = x * x return u - def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[feat_idx] - C=1/self.Lambda[feat_idx] + mu = self.mu[feat_idx] + C = 1 / self.Lambda[feat_idx] plot_gaussian_1D(mu, C, num_sigmas, num_pts, **kwargs) - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[feat_idx] - C=np.diag(1./self.Lambda[feat_idx]) + mu = self.mu[feat_idx] + C = np.diag(1.0 / self.Lambda[feat_idx]) plot_gaussian_ellipsoid_2D(mu, C, num_sigmas, num_pts, **kwargs) - def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[feat_idx] - C=np.diag(1./self.Lambda[feat_idx]) + mu = self.mu[feat_idx] + C = np.diag(1.0 / self.Lambda[feat_idx]) plot_gaussian_3D(mu, C, num_sigmas, num_pts, **kwargs) - - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, - **kwargs): - mu=self.mu[feat_idx] - C=np.diag(1./self.Lambda[feat_idx]) - plot_gaussian_ellipsoid_3D(mu, C, num_sigmas, num_pts, **kwargs) - + def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): + mu = self.mu[feat_idx] + C = np.diag(1.0 / self.Lambda[feat_idx]) + plot_gaussian_ellipsoid_3D(mu, C, num_sigmas, num_pts, **kwargs) DiagNormal = NormalDiagCov diff --git a/hyperion/pdfs/core/pdf.py b/hyperion/pdfs/core/pdf.py index b93090dd..2764780c 100644 --- a/hyperion/pdfs/core/pdf.py +++ b/hyperion/pdfs/core/pdf.py @@ -8,6 +8,7 @@ from abc import ABCMeta, abstractmethod from ...hyp_model import HypModel + class PDF(HypModel): __metaclass__ = ABCMeta @@ -15,27 +16,21 @@ def __init__(self, x_dim=1, **kwargs): super(PDF, self).__init__(**kwargs) self.x_dim = x_dim - def get_config(self): - config = {'x_dim': self.x_dim } + config = {"x_dim": self.x_dim} base_config = super(PDF, self).get_config() return dict(list(base_config.items()) + list(config.items())) - @abstractmethod def log_prob(self, x): pass - - + def eval_llk(self, x): return self.log_prob(x) - @abstractmethod def sample(self, num_samples): pass - def generate(self, num_samples, **kwargs): return self.generate(num_samples, **kwargs) - diff --git a/hyperion/pdfs/hmm/hmm.py b/hyperion/pdfs/hmm/hmm.py index 2fbffdad..aeec994a 100644 --- a/hyperion/pdfs/hmm/hmm.py +++ b/hyperion/pdfs/hmm/hmm.py @@ -9,80 +9,74 @@ from ...utils.math import softmax, logsumexp from ..core import PDF -class HMM(PDF): +class HMM(PDF): def __init__(self, **kwargs): - super(HMM, self).__init__(num_states=1, pi=None, trans=None, trans_mask=None, - update_pi=True, update_trans=True, - tied_trans = False, - left_to_right = False, - **kwargs) + super(HMM, 
self).__init__( + num_states=1, + pi=None, + trans=None, + trans_mask=None, + update_pi=True, + update_trans=True, + tied_trans=False, + left_to_right=False, + **kwargs + ) if pi is not None: num_states = len(pi) - + self.num_states = num_states self.pi = pi self.trans = trans self.trans_mask = trans_mask - + self.update_pi = update_pi self.update_trans = update_trans self.tied_trans = tied_trans self.left_to_right = left_to_right - + if left_to_rigth and (trans_mask is None): self.trans_mask = np.triu(np.ones_like(self.trans)) self._log_pi = None self._log_trans = None - - def reset_aux(): self._log_pi = None self._log_trans = None - @property def is_init(self): if self._is_init: return True - + if self.pi is not None and self.trans is not None: self.validate() - self._is_init =True + self._is_init = True return self._is_init - - - + @property def log_pi(self): if self._log_pi is None: - self._log_pi = np.log(self.pi+1e-15) + self._log_pi = np.log(self.pi + 1e-15) return self._log_pi - @property def log_trans(self): if self._log_trans is None: - self._log_trans = np.log(self.trans+1e-15) + self._log_trans = np.log(self.trans + 1e-15) return self._log_trans - - def validate(self): - assert(len(self.pi) == self.num_states) - assert(self.trans.shape[0] == self.num_states) - assert(self.trans.shape[1] == self.num_states) + assert len(self.pi) == self.num_states + assert self.trans.shape[0] == self.num_states + assert self.trans.shape[1] == self.num_states if self.trans_mask is not None: assert self.trans_mask.shape == self.trans.shape - - def fit(self, x, sample_weight=None, - x_val=None, sample_weight_val=None, - epochs=10): - + def fit(self, x, sample_weight=None, x_val=None, sample_weight_val=None, epochs=10): N_val_tot = 0 elbo = np.zeros((epochs,), dtype=float_cpu()) @@ -90,10 +84,10 @@ def fit(self, x, sample_weight=None, stats = None for epoch in range(epochs): for i in range(x.shape[0]): - stats =self.Estep(x[i], stats) + stats = self.Estep(x[i], stats) pz, Nzz = stats elbo[epoch] += self.elbo(x[i], pz=pz, Nzz=Nzz) - + self.Mstep(stats) if x_val is not None: @@ -103,12 +97,10 @@ def fit(self, x, sample_weight=None, N_tot = np.sum([x_i.shape[0] for x_i in x]) if x_val is None: - return elbo, elbo/N_tot + return elbo, elbo / N_tot else: N_val_tot = np.sum([x_i.shape[0] for x_i in x_val]) - return elbo, elbo/N_tot, elbo_val, elbo_val/N_val_tot - - + return elbo, elbo / N_tot, elbo_val, elbo_val / N_val_tot def forward(self, x): # x = log P(x|z) @@ -116,25 +108,23 @@ def forward(self, x): log_alpha = np.zeros((N, self.num_states), dtype=float_cpu()) log_alpha[0] = self.log_pi + x[0] for n in range(1, N): - log_alpha[n] = x[n] + logsumexp(log_alpha[n-1][:, None] + self.log_trans, axis=0) - - return log_alpha + log_alpha[n] = x[n] + logsumexp( + log_alpha[n - 1][:, None] + self.log_trans, axis=0 + ) + return log_alpha - def backward(self, x): N = x.shape[0] log_beta = np.zeros((N, self.num_states), dtype=float_cpu()) log_beta[-1] = 1 - for n in range(N-2, -1, -1): - r = log_beta[n+1] + x[n+1] + self.log_trans + for n in range(N - 2, -1, -1): + r = log_beta[n + 1] + x[n + 1] + self.log_trans log_beta[n] = logsumexp(r.T, axis=0) return log_beta - - def compute_pz(self, x, return_Nzz=False, return_log_px=False): log_alpha = self.forward(x) log_beta = self.backward(x) @@ -142,7 +132,7 @@ def compute_pz(self, x, return_Nzz=False, return_log_px=False): pz = softmax(log_alpha + log_beta, axis=-1) - if not(return_Nzz or return_elbo or return_log_px): + if not (return_Nzz or return_elbo or 
return_log_px): return pz r = [pz] @@ -161,20 +151,14 @@ def compute_pz(self, x, return_Nzz=False, return_log_px=False): return tuple(r) - - def elbo(self, x, pz=None, Nzz=None): if pz is None: pz, Nzz = self.compute_pz(x, return_Nzz=True) Nz = pz[0] - elbo = (np.sum(Nz*self.log_pi) + - np.sum(Nzz*self.log_trans) + - np.sum(pz*x)) + elbo = np.sum(Nz * self.log_pi) + np.sum(Nzz * self.log_trans) + np.sum(pz * x) return elbo - - def Estep(self, x, stats_0=None): if stats_0 is None: @@ -187,46 +171,40 @@ def Estep(self, x, stats_0=None): Nz += pz[0] Nzz += pzz stats = (Nz, Nzz) - - return pz, stats + return pz, stats - def Mstep(self, stats): Nz, Nzz = stats - self.pi = Nz/np.sum(Nz) - self.trans = Nzz/np.sum(Nzz, axis=-1, keepdims=True) + self.pi = Nz / np.sum(Nz) + self.trans = Nzz / np.sum(Nzz, axis=-1, keepdims=True) if self.tied_trans: p_loop = np.mean(np.diag(self.trans)) - self.trans[:] = (1-p_loop)/self.num_states + self.trans[:] = (1 - p_loop) / self.num_states self.trans[np.diag_indices(self.num_states)] = p_loop - + if self.trans_mask is not None: self.trans *= self.trans_mask self.trans /= np.sum(self.trans, axis=-1, keepdims=True) self.reset_aux() - - def log_predictive(self, x): # log p(x_{N+1}|x_1,..,x_N} assert self.is_init - + log_alpha = self.forward(x)[:-1] log_px = np.sum(log_alpha, axis=-1, keepdims=True) log_alpha_e = np.expand_dims(log_alpha, axis=-1) log_trans_e = np.expand_dims(self.log_trans, axis=0) - - log_pred = logsumexp(log_alpha_e+log_trans_e, axis=1) - log_pred = logsumexp(log_pred+x[1:], axis=-1) - log_px - return log_pred - + log_pred = logsumexp(log_alpha_e + log_trans_e, axis=1) + log_pred = logsumexp(log_pred + x[1:], axis=-1) - log_px + return log_pred def viterbi_decode(self, x, nbest=1): assert self.is_init @@ -234,25 +212,23 @@ def viterbi_decode(self, x, nbest=1): phi = np.zeros((x.shape[0], self.num_states), dtype=int) w = self.log_pi + x[0] for i in range(x.shape[0]): - u = w[:,None] + self.log_trans + u = w[:, None] + self.log_trans k_max = np.argmax(u, axis=0) - w = x[i] + u[k_max,idx_aux] - phi[i-1] = k_max + w = x[i] + u[k_max, idx_aux] + phi[i - 1] = k_max best = np.fliplr(np.argsort(w))[:nbest] log_pxz = w[best] paths = np.zeros((nbest, x.shape[0]), dtype=int) for n in range(nbest): k_max = best[n] - paths[n,-1] = k_max - for i in range(x.shape[0]-2,-1,-1): + paths[n, -1] = k_max + for i in range(x.shape[0] - 2, -1, -1): k_max = phi[i, k_max] - paths[n,i] = k_max + paths[n, i] = k_max return paths, log_pxz - - - + def sample(self, num_seqs, num_steps, rng=None, seed=1024): if rng is None: rng = np.random.RandomState(seed) @@ -261,36 +237,30 @@ def sample(self, num_seqs, num_steps, rng=None, seed=1024): x[:, 0, :] = rng.multinomial(1, self.pi, size=(num_seqs,)) for t in range(1, num_steps): for k in range(self.num_states): - index = x[:,t-1,k] == 1 + index = x[:, t - 1, k] == 1 n_k = num.sum(index) if n_k == 0: continue x[index] = rng.multinomial(1, self.trans[k], size=(n_k,)) return x - - def get_config(self): - config = {'update_pi': self.update_pi, - 'update_trans': self.update_trans, - 'tied_trans': self.tied_trans, - 'left_to_right': self.left_to_right} + config = { + "update_pi": self.update_pi, + "update_trans": self.update_trans, + "tied_trans": self.tied_trans, + "left_to_right": self.left_to_right, + } base_config = super(HMM, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - - def save_params(self, f): - params = {'pi': self.pi, - 'trans': self.trans} + params = {"pi": self.pi, "trans": self.trans} 
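        # Only pi and trans are stored as arrays here; flags such as update_pi,
        # update_trans, tied_trans and left_to_right travel in the config (see
        # get_config) and are passed back via **config in load_params.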
self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['pi', 'trans'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(pi=params['pi'], trans=params['trans'], **config) + param_list = ["pi", "trans"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(pi=params["pi"], trans=params["trans"], **config) diff --git a/hyperion/pdfs/jfa/__init__.py b/hyperion/pdfs/jfa/__init__.py index d6ba7b82..2207d957 100644 --- a/hyperion/pdfs/jfa/__init__.py +++ b/hyperion/pdfs/jfa/__init__.py @@ -5,6 +5,3 @@ from .jfa_total import JFATotal - - - diff --git a/hyperion/pdfs/jfa/jfa_total.py b/hyperion/pdfs/jfa/jfa_total.py index 961adb3f..74fe0f95 100644 --- a/hyperion/pdfs/jfa/jfa_total.py +++ b/hyperion/pdfs/jfa/jfa_total.py @@ -7,12 +7,17 @@ from scipy import linalg as sla from ...hyp_defs import float_cpu -from ...utils.math import invert_pdmat, invert_trimat, logdet_pdmat, vec2symmat, symmat2vec +from ...utils.math import ( + invert_pdmat, + invert_trimat, + logdet_pdmat, + vec2symmat, + symmat2vec, +) from ..core.pdf import PDF class JFATotal(PDF): - def __init__(self, K, y_dim=None, T=None, **kwargs): super(JFATotal, self).__init__(**kwargs) if T is not None: @@ -22,16 +27,13 @@ def __init__(self, K, y_dim=None, T=None, **kwargs): self.y_dim = y_dim self.T = T - #aux + # aux self._TT = None self.__upptr = None - def reset_aux(self): self._TT = None - - @property def is_init(): if self._is_init: @@ -40,51 +42,47 @@ def is_init(): self._is_init = True return self._is_init - def initialize(self, N, F): assert N.shape[0] == self.K - - self.T = np.random.randn(self.y_dim, F.shape[1]).astype( - float_cpu(), copy=False) - + self.T = np.random.randn(self.y_dim, F.shape[1]).astype(float_cpu(), copy=False) - def compute_py_g_x(self, N , F, G=None, return_cov=False, return_elbo=False, - return_acc=False): + def compute_py_g_x( + self, N, F, G=None, return_cov=False, return_elbo=False, return_acc=False + ): assert self.is_init - x_dim = int(F.shape[1]/self.K) + x_dim = int(F.shape[1] / self.K) M = F.shape[0] y_dim = self.y_dim - + compute_inv = return_cov or return_acc return_tuple = compute_inv or return_elbo TF = np.dot(F, self.T.T) L = self.compute_L(self.TT, N, self._upptr) y = np.zeros((M, y_dim), dtype=float_cpu()) - + if return_cov: - Sy = np.zeros((M, y_dim*(y_dim+1)/2), dtype=float_cpu()) + Sy = np.zeros((M, y_dim * (y_dim + 1) / 2), dtype=float_cpu()) else: Sy = None if return_elbo: elbo = np.zeros((M,), dtype=float_cpu()) - + if return_acc: Py = np.zeros((y_dim, y_dim), dtype=float_cpu()) - Ry = np.zeros((self.K, y_dim*(y_dim+1)/2), dtype=float_cpu()) + Ry = np.zeros((self.K, y_dim * (y_dim + 1) / 2), dtype=float_cpu()) - Li = np.zeros((self.y_dim, self.y_dim), dtype=float_cpu()) for i in range(N.shape[0]): Li[self._upptr] = L[i] - r = invert_pdmat(Li, right_inv=True, - return_logdet=return_elbo, - return_inv=compute_inv) + r = invert_pdmat( + Li, right_inv=True, return_logdet=return_elbo, return_inv=compute_inv + ) mult_iL = r[0] if return_elbo: - elbo[i] = - r[2]/2 + elbo[i] = -r[2] / 2 if compute_inv: iL = r[-1] @@ -96,8 +94,8 @@ def compute_py_g_x(self, N , F, G=None, return_cov=False, return_elbo=False, if return_acc: iL += np.outer(y[i], y[i]) Py += iL - Ry += iL[self.__uppr]*N[i][:,None] - + Ry += iL[self.__uppr] * N[i][:, None] + if not return_tuple: return y @@ -105,102 +103,102 @@ def compute_py_g_x(self, N , F, G=None, return_cov=False, return_elbo=False, if return_cov: r += [Sy] - + 
if return_elbo: if G is not None: elbo += G - elbo += 0.5*np.sum(VF*y, axis=-1) + elbo += 0.5 * np.sum(VF * y, axis=-1) r += [elbo] if return_acc: r += [Ry, Py] - + return tuple(r) - - def Estep(self, N, F, G=None): - + y, elbo, Ry, Py = self.compute_py_g_x( - N, F, G, return_elbo=True, return_acc=True) + N, F, G, return_elbo=True, return_acc=True + ) M = y.shape[0] y_acc = np.sum(y, axis=0) Cy = np.dot(F, y) - + elbo = np.sum(elbo) - stats = (elbo, M, y_acc, Ry, Cy, Py) + stats = (elbo, M, y_acc, Ry, Cy, Py) return stats - - def MstepML(self, stats): _, M, y_acc, Ry, Cy, _ = stats T = np.zeros_like(self.T) Ryk = np.zeros((self.y_dim, self.y_dim), dtype=float_cpu()) - x_dim = T.shape[1]/self.K + x_dim = T.shape[1] / self.K for k in range(self.K): - idx = k*x_dim + idx = k * x_dim Ryk[self._upptr] = Ry[k] iRyk_mult = invert_pdmat(Ryk, right_inv=False)[0] - T[:, idx:idx+x_dim] = iRyk_mult(Cy[idx:idx+x_dim].T) + T[:, idx : idx + x_dim] = iRyk_mult(Cy[idx : idx + x_dim].T) self.T = T self.reset_aux() - - def MstepMD(self, stats): _, M, y_acc, Ry, Cy, Py = stats - mu_y = y_acc/M - Cy = Py/M - np.outer(my_y, mu_y) + mu_y = y_acc / M + Cy = Py / M - np.outer(my_y, mu_y) chol_Cy = la.cholesky(Cy, lower=False, overwrite_a=True) - self.T = np.dot(chol_Cy , self.T) - - self.reset_aux() + self.T = np.dot(chol_Cy, self.T) + self.reset_aux() - - def fit(self, N, F, G=None, N_val=None, F_val=None, epochs=20, ml_md='ml+md', md_epochs=None): + def fit( + self, + N, + F, + G=None, + N_val=None, + F_val=None, + epochs=20, + ml_md="ml+md", + md_epochs=None, + ): + + use_ml = False if ml_md == "md" else True + use_md = False if ml_md == "ml" else True - use_ml = False if ml_md == 'md' else True - use_md = False if ml_md == 'ml' else True - if not self.is_init: self.initialize(N, F) elbo = np.zeros((epochs,), dtype=float_cpu()) elbo_val = np.zeros((epochs,), dtype=float_cpu()) for epoch in range(epochs): - + stats = self.Estep(N, F, G) elbo[epoch] = stats[0] if N_val is not None and F_val is not None: - _, elbo_val_e=self.compute_py_x(N, F, G, return_elbo=True) - elbo_val[epoch]=np.sum(elbo_val_e) + _, elbo_val_e = self.compute_py_x(N, F, G, return_elbo=True) + elbo_val[epoch] = np.sum(elbo_val_e) if use_ml: self.MstepML(stats) if use_md and (md_epochs is None or epoch in md_epochs): self.MstepMD(stats) - elbo_norm= elbo/np.sum(N) + elbo_norm = elbo / np.sum(N) if x_val is None: return elbo, elbo_norm else: - elbo_val_norm = elbo_val/np.sum(N_val) + elbo_val_norm = elbo_val / np.sum(N_val) return elbo, elbo_norm, elbo_val, elbo_val_norm - - @property def TT(self): if self._TT is None: self._TT = self.compute_TT(self.T, self.K) return self._TT - @property def _upptr(self): if self.__upptr is None: @@ -208,64 +206,53 @@ def _upptr(self): self.__upptr = np.triu(I).ravel() return self.__upptr - - @staticmethod def compute_TT(self, T, K, upptr): - x_dim = int(T.shape[1]/K) + x_dim = int(T.shape[1] / K) y_dim = T.shape[0] - TT = np.zeros((K, y_dim*(y_dim+1)/2), dtype=float_cpu()) + TT = np.zeros((K, y_dim * (y_dim + 1) / 2), dtype=float_cpu()) for k in range(K): - idx = k*x_dim - T_k = T[:,idx:idx+x_dim] + idx = k * x_dim + T_k = T[:, idx : idx + x_dim] TT_k = np.dot(T_k, T_k.T) TT[k] = TT_k[self._upptr] - - return TT + return TT - @staticmethod def compute_L(TT, N, upptr): y_dim = self._upptr.shape[0] I = np.eye(y_dim, dtype=float_cpu())[self._upptr] - return I+np.dot(N, TT) - + return I + np.dot(N, TT) @staticmethod def normalize_T(T, chol_prec): Tnorm = np.zeros_like(T) K = chol_prec.shape[0] - x_dim = 
int(T.shape[1]/K) + x_dim = int(T.shape[1] / K) for k in range(K): - idx = k*x_dim - Tnorm[:,idx:idx+x_dim] = np.dot(T[:,idx:idx+x_dim], chol_prec[k].T) - - return Tnorm + idx = k * x_dim + Tnorm[:, idx : idx + x_dim] = np.dot( + T[:, idx : idx + x_dim], chol_prec[k].T + ) + return Tnorm - def get_config(self): - config = {'K': self.K } + config = {"K": self.K} base_config = super(JFATotal, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {'T': self.T} + params = {"T": self.T} self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['T'] - params = cls._load_params_to_dict(f, config['name'], param_list) + param_list = ["T"] + params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) - - def sample(self, num_samples): pass diff --git a/hyperion/pdfs/mixtures/__init__.py b/hyperion/pdfs/mixtures/__init__.py index a55052f3..f9168905 100644 --- a/hyperion/pdfs/mixtures/__init__.py +++ b/hyperion/pdfs/mixtures/__init__.py @@ -4,10 +4,7 @@ """ - from .exp_family_mixture import ExpFamilyMixture from .gmm_diag_cov import GMMDiagCov, DiagGMM -from .gmm_tied_diag_cov import GMMTiedDiagCov, DiagGMMTiedCov +from .gmm_tied_diag_cov import GMMTiedDiagCov, DiagGMMTiedCov from .gmm import GMM - - diff --git a/hyperion/pdfs/mixtures/exp_family_mixture.py b/hyperion/pdfs/mixtures/exp_family_mixture.py index db7e6f9e..113bb8fc 100644 --- a/hyperion/pdfs/mixtures/exp_family_mixture.py +++ b/hyperion/pdfs/mixtures/exp_family_mixture.py @@ -15,9 +15,10 @@ class ExpFamilyMixture(PDF): __metaclass__ = ABCMeta - - def __init__(self, num_comp=1, pi=None, eta=None, min_N=0, - update_pi=True, **kwargs): + + def __init__( + self, num_comp=1, pi=None, eta=None, min_N=0, update_pi=True, **kwargs + ): super().__init__(**kwargs) if pi is not None: num_comp = len(pi) @@ -29,156 +30,158 @@ def __init__(self, num_comp=1, pi=None, eta=None, min_N=0, self._log_pi = None self.update_pi = update_pi - @property def is_init(self): if not self._is_init: - if (self.eta is not None and self.A is not None and - self.pi is not None): + if self.eta is not None and self.A is not None and self.pi is not None: self.validate() self._is_init = True return self._is_init - @property def log_pi(self): if self._log_pi is None: - self._log_pi = np.log(self.pi+1e-15) + self._log_pi = np.log(self.pi + 1e-15) return self._log_pi - - def _validate_pi(self): assert len(self.pi) == self.num_comp - - - def fit(self, x, sample_weight=None, - x_val=None, sample_weight_val=None, - epochs=10, batch_size=None): + def fit( + self, + x, + sample_weight=None, + x_val=None, + sample_weight_val=None, + epochs=10, + batch_size=None, + ): if not self.is_init: self.initialize(x) - + log_h = self.accum_log_h(x, sample_weight) if x_val is not None: log_h_val = self.accum_log_h(x_val, sample_weight_val) - + elbo = np.zeros((epochs,), dtype=float_cpu()) elbo_val = np.zeros((epochs,), dtype=float_cpu()) for epoch in range(epochs): - N, u_x =self.Estep(x=x, sample_weight=sample_weight, - batch_size=batch_size) - elbo[epoch]=self.elbo(None, N=N, u_x=u_x, log_h=log_h) + N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size) + elbo[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h) self.Mstep(N, u_x) - + if x_val is not None: - N, u_x = self.Estep(x=x_val, sample_weight=sample_weight_val, - batch_size=batch_size) + N, u_x = self.Estep( + x=x_val, 
sample_weight=sample_weight_val, batch_size=batch_size + ) elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val) if x_val is None: - return elbo, elbo/x.shape[0] + return elbo, elbo / x.shape[0] else: - return elbo, elbo/x.shape[0], elbo_val, elbo_val/x.shape[0] - - - - def fit_generator(self, generator, train_steps, epochs=10, - val_data=None, val_steps=0, - max_queue_size=10, workers=1, use_multiprocessing=False): + return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] + + def fit_generator( + self, + generator, + train_steps, + epochs=10, + val_data=None, + val_steps=0, + max_queue_size=10, + workers=1, + use_multiprocessing=False, + ): do_validation = bool(validation_data) - val_gen = (hasattr(validation_data, 'next') or - hasattr(validation_data, '__next__') or - isinstance(validation_data, Sequence)) + val_gen = ( + hasattr(validation_data, "next") + or hasattr(validation_data, "__next__") + or isinstance(validation_data, Sequence) + ) if val_gen and not validation_steps: - raise ValueError('When using a generator for validation data, ' - 'you must specify a value for ' - '`validation_steps`.') + raise ValueError( + "When using a generator for validation data, " + "you must specify a value for " + "`validation_steps`." + ) if do_validation and not val_gen: x, u_x_val, sample_weight_val = self.tuple2data(val_data) log_h_val = self.accum_log_h(x, sample_weight_val) - + elbo = np.zeros((epochs,), dtype=float_cpu()) elbo_val = np.zeros((epochs,), dtype=float_cpu()) for epoch in range(epochs): - N, u_x, log_h =self.Estep_generator( - generator, train_steps, return_log_h=True, - max_queue_size=max_queue_size, workers=workers, - use_multiprocessing=use_multiprocessing) - + N, u_x, log_h = self.Estep_generator( + generator, + train_steps, + return_log_h=True, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + ) + self.Mstep(N, u_x) - elbo[epoch]=self.elbo(None, N=N, u_x=u_x, log_h=log_h) - + elbo[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h) + if val_data is not None: if val_gen: N, u_x, log_h_val = self.Estep_generator( - generator, train_steps, return_log_h = True, - max_queue_size=max_queue_size, workers=workers, - use_multiprocessing=use_multiprocessing) + generator, + train_steps, + return_log_h=True, + max_queue_size=max_queue_size, + workers=workers, + use_multiprocessing=use_multiprocessing, + ) else: N, u_x = self.Estep(x_val, u_x_val, sample_weight_val) elbo_val[epoch] = self.elbo(None, N=N, u_x=u_x, log_h=log_h_val) if x_val is None: - return elbo, elbo/x.shape[0] + return elbo, elbo / x.shape[0] else: - return elbo, elbo/x.shape[0], elbo_val, elbo_val/x.shape[0] + return elbo, elbo / x.shape[0], elbo_val, elbo_val / x.shape[0] - - def log_h(self, x): return 0 - def accum_log_h(self, x, sample_weight=None): if sample_weight is None: return np.sum(self.log_h(x)) return np.sum(sample_weight * self.log_h(x)) - - def compute_log_pz(self, x, u_x=None, mode='nat'): + def compute_log_pz(self, x, u_x=None, mode="nat"): if u_x is None: u_x = self.compute_suff_stats(x) return np.dot(u_x, self.eta.T) - self.A + self.log_pi - - def compute_pz(self, x, u_x=None, mode='nat'): - if mode == 'nat': + def compute_pz(self, x, u_x=None, mode="nat"): + if mode == "nat": return self.compute_pz_nat(x, u_x) else: return self.compute_pz_std(x) - - def compute_pz_nat(self, x, u_x=None): if u_x is None: u_x = self.compute_suff_stats(x) logr = np.dot(u_x, self.eta.T) - self.A + self.log_pi return softmax(logr) - - def 
compute_pz_std(self, x): return self.compute_pz_nat(x) - - def compute_suff_stats(self, x): return x - - def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): if u_x is not None or batch_size is None: return self._accum_suff_stats_1batch(x, u_x, sample_weight) else: return self._accum_suff_stats_nbatches(x, sample_weight, batch_size) - - def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): if u_x is None: u_x = self.compute_suff_stats(x) @@ -191,13 +194,11 @@ def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): # L_z=gmm.ElnP_z_w(N,gmm.lnw)-gmm.Elnq_z(z); return N, acc_u_x - - def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): sw_i = None for i1 in range(0, x.shape[0], batch_size): - i2 = np.minimum(i1+batch_size, x.shape[0]) - x_i = x[i1:i2,:] + i2 = np.minimum(i1 + batch_size, x.shape[0]) + x_i = x[i1:i2, :] if sample_weight is not None: sw_i = sample_weight[i1:i2] N_i, u_x_i = self._accum_suff_stats_1batch(x_i, sample_weight=sw_i) @@ -209,9 +210,9 @@ def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): u_x += u_x_i return N, u_x - - - def accum_suff_stats_segments(self, x, segments, u_x=None, sample_weight=None, batch_size=None): + def accum_suff_stats_segments( + self, x, segments, u_x=None, sample_weight=None, batch_size=None + ): K = self.num_comp num_segments = len(segments) N = np.zeros((num_segments, K), dtype=float_cpu()) @@ -220,32 +221,35 @@ def accum_suff_stats_segments(self, x, segments, u_x=None, sample_weight=None, b sw_i = None for i in range(num_segments): start = int(segments[i][0]) - end = int(segments[i][1])+1 + end = int(segments[i][1]) + 1 x_i = x[start:end] if u_x is not None: u_x_i = u_x[start:end] if sample_weight is not None: sw_i = sample_weight[start:end] N_i, acc_u_x_i = self.accum_suff_stats( - x_i, u_x=u_x_i, sample_weight=sw_i, batch_size=batch_size) + x_i, u_x=u_x_i, sample_weight=sw_i, batch_size=batch_size + ) N[i] = N_i acc_u_x[i] = acc_u_x_i return N, acc_u_x - - - def accum_suff_stats_segments_prob(self, x, prob, u_x=None, sample_weight=None, batch_size=None): + def accum_suff_stats_segments_prob( + self, x, prob, u_x=None, sample_weight=None, batch_size=None + ): if u_x is not None or batch_size is None: return self._accum_suff_stats_segments_prob_1batch( - x, prob, u_x, sample_weight) + x, prob, u_x, sample_weight + ) else: return self._accum_suff_stats_segments_prob_nbatches( - x, prob, sample_weight, batch_size) - + x, prob, sample_weight, batch_size + ) - - def _accum_suff_stats_segments_prob_1batch(self, x, prob, u_x=None, sample_weight=None): + def _accum_suff_stats_segments_prob_1batch( + self, x, prob, u_x=None, sample_weight=None + ): if u_x is None: u_x = self.compute_suff_stats(x) z = self.compute_pz_nat(x, u_x) @@ -255,27 +259,29 @@ def _accum_suff_stats_segments_prob_1batch(self, x, prob, u_x=None, sample_weigh K = len(self.pi) num_segments = prob.shape[1] N = np.zeros((num_segments, K), float_cpu()) - acc_u_x=np.zeros((num_segments, K, self.eta.shape[1]), float_cpu()) + acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), float_cpu()) for i in range(num_segments): - z_i = z*prob[:,i][:, None] + z_i = z * prob[:, i][:, None] N[i] = np.sum(z_i, axis=0) acc_u_x[i] = np.dot(z_i.T, u_x) return N, acc_u_x + def _accum_suff_stats_segments_prob_nbatches( + self, x, prob, sample_weight, batch_size + ): - - def _accum_suff_stats_segments_prob_nbatches(self, x, prob, sample_weight, batch_size): - sw_i = None for i1 in range(0, x.shape[0], batch_size): - i2 
= np.minimum(i1+batch_size, x.shape[0]) - x_i = x[i1:i2,:] - prob_i = prob[i1:i2,:] + i2 = np.minimum(i1 + batch_size, x.shape[0]) + x_i = x[i1:i2, :] + prob_i = prob[i1:i2, :] if sample_weight is not None: sw_i = sample_weight[i1:i2] - N_i, u_x_i = self._accum_suff_stats_segments_prob_1batch(x_i, prob_i, sample_weight=sw_i) + N_i, u_x_i = self._accum_suff_stats_segments_prob_1batch( + x_i, prob_i, sample_weight=sw_i + ) if i1 == 0: N = N_i u_x = u_x_i @@ -284,23 +290,31 @@ def _accum_suff_stats_segments_prob_nbatches(self, x, prob, sample_weight, batch u_x += u_x_i return N, u_x - - - def accum_suff_stats_sorttime(self, x, frame_length, frame_shift,u_x=None, sample_weight=None, batch_size=None): + def accum_suff_stats_sorttime( + self, + x, + frame_length, + frame_shift, + u_x=None, + sample_weight=None, + batch_size=None, + ): if u_x is not None or batch_size is None: return self._accum_suff_stats_sorttime_1batch( - x, frame_length, frame_shift, u_x, sample_weight) + x, frame_length, frame_shift, u_x, sample_weight + ) else: return self._accum_suff_stats_sorttime_nbatches( - x, frame_length, frame_shift, sample_weight, batch_size) + x, frame_length, frame_shift, sample_weight, batch_size + ) - - - def _accum_suff_stats_sorttime_1batch(self, x, frame_length, frame_shift, u_x=None, sample_weight=None): + def _accum_suff_stats_sorttime_1batch( + self, x, frame_length, frame_shift, u_x=None, sample_weight=None + ): K = len(self.pi) num_frames = x.shape[0] - num_segments = int(np.floor((num_frames-frame_length)/frame_shift + 1)) + num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) if num_segments == 1: return self._accum_suff_stats_1batch(self, x, u_x, sample_weight) @@ -313,75 +327,80 @@ def _accum_suff_stats_sorttime_1batch(self, x, frame_length, frame_shift, u_x=No N = np.zeros((num_segments, K), float_cpu()) acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), float_cpu()) - start1 = int(frame_shift-1) - end1 = int((num_segments-1) * frame_shift) + start1 = int(frame_shift - 1) + end1 = int((num_segments - 1) * frame_shift) start2 = int(start1 + frame_length) end2 = int(end1 + frame_length) cum_N = np.cumsum(z, axis=0) - N[0] = cum_N[frame_length-1] - N[1:] = (cum_N[start2:end2:frame_shift] - - cum_N[start1:end1:frame_shift]) - + N[0] = cum_N[frame_length - 1] + N[1:] = cum_N[start2:end2:frame_shift] - cum_N[start1:end1:frame_shift] + for k in range(K): - cum_u_x_k = np.cumsum(z[:,k][:,None] * u_x, axis=0) - acc_u_x[0,k] = cum_u_x_k[frame_length-1] - acc_u_x[1:,k] = (cum_u_x_k[start2:end2:frame_shift] - - cum_u_x_k[start1:end1:frame_shift]) + cum_u_x_k = np.cumsum(z[:, k][:, None] * u_x, axis=0) + acc_u_x[0, k] = cum_u_x_k[frame_length - 1] + acc_u_x[1:, k] = ( + cum_u_x_k[start2:end2:frame_shift] - cum_u_x_k[start1:end1:frame_shift] + ) return N, acc_u_x - - - def _accum_suff_stats_sorttime_nbatches(self, x, frame_length, frame_shift, sample_weight, batch_size): + def _accum_suff_stats_sorttime_nbatches( + self, x, frame_length, frame_shift, sample_weight, batch_size + ): K = len(self.pi) num_frames = x.shape[0] - num_segments = int(np.floor((num_frames-frame_length)/frame_shift + 1)) + num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1)) if num_segments == 1: return self._accum_suff_stats_1batch(self, x, u_x, sample_weight) - num_segments_per_batch = np.floor((num_frames-frame_length)/frame_shift + 1) - batch_size = int((num_segments_per_batch-1)*frame_shift+frame_length) - batch_shift = int(num_segments_per_batch*frame_shift) + 
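The sorted-time accumulator above pools per-frame statistics into overlapping windows with a single cumulative sum followed by strided differences. A self-contained sketch of the same trick (the frame_length/frame_shift values are arbitrary, not from the patch):

import numpy as np

def window_sums(z, frame_length, frame_shift):
    # sum of z over each window [i*shift, i*shift + length) via cumsum differences
    num_frames = z.shape[0]
    num_segments = int(np.floor((num_frames - frame_length) / frame_shift + 1))
    cum = np.cumsum(z, axis=0)
    out = np.zeros((num_segments,) + z.shape[1:], dtype=z.dtype)
    out[0] = cum[frame_length - 1]
    start1 = frame_shift - 1
    end1 = (num_segments - 1) * frame_shift
    out[1:] = (cum[start1 + frame_length : end1 + frame_length : frame_shift]
               - cum[start1:end1:frame_shift])
    return out

z = np.random.rand(100, 4)                                     # e.g. per-frame responsibilities
print(window_sums(z, frame_length=25, frame_shift=10).shape)   # (8, 4)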
num_segments_per_batch = np.floor((num_frames - frame_length) / frame_shift + 1) + batch_size = int((num_segments_per_batch - 1) * frame_shift + frame_length) + batch_shift = int(num_segments_per_batch * frame_shift) N = np.zeros((num_segments, K), float_cpu()) acc_u_x = np.zeros((num_segments, K, self.eta.shape[1]), float_cpu()) - + sw_i = None - cur_segment=0 + cur_segment = 0 for i1 in range(0, x.shape[0], batch_shift): - i2 = np.minimum(i1+batch_size, x.shape[0]) - x_i = x[i1:i2,:] + i2 = np.minimum(i1 + batch_size, x.shape[0]) + x_i = x[i1:i2, :] if sample_weight is not None: sw_i = sample_weight[i1:i2] - N_i, u_x_i = self._accum_suff_stats_sorttime_1batch(x_i, frame_length, frame_shift, sample_weight=sw_i) + N_i, u_x_i = self._accum_suff_stats_sorttime_1batch( + x_i, frame_length, frame_shift, sample_weight=sw_i + ) num_segments_i = N_i.shape[0] - N[cur_segment:cur_segment+num_segments_i] = N_i - acc_u_x[cur_segment:cur_segment+num_segments_i] = u_x_i + N[cur_segment : cur_segment + num_segments_i] = N_i + acc_u_x[cur_segment : cur_segment + num_segments_i] = u_x_i cur_segment += num_segments_i return N, acc_u_x - - def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - - - def Estep_generator(self, generator, num_steps, return_log_h, - max_queue_size=10, workers=1, use_multiprocessin=False): - wait_time = 0.01 # in secs + def Estep_generator( + self, + generator, + num_steps, + return_log_h, + max_queue_size=10, + workers=1, + use_multiprocessin=False, + ): + wait_time = 0.01 # in secs queue = None N = None acc_u_x = None log_h = 0 try: - queue = GeneratorQueue(generator, - use_multiprocessing=use_multiprocessing, - wait_time=wait_time) + queue = GeneratorQueue( + generator, use_multiprocessing=use_multiprocessing, wait_time=wait_time + ) queue.start(workers=workers, max_queue_size=max_queue_size) queue_generator = queue.get() - + cur_step = 0 for cur_step in range(num_steps): data = next(queue_generator) @@ -398,100 +417,80 @@ def Estep_generator(self, generator, num_steps, return_log_h, finally: if enqueuer is not None: enqueuer.stop() - + if return_log_h: return N, acc_u_x, log_h else: return N, acc_u_x - def sum_suff_stats(self, N, u_x): - assert len(N)==len(u_x) + assert len(N) == len(u_x) acc_N = N[1] acc_u_x = u_x[1] - for i in range(1,len(N)): + for i in range(1, len(N)): acc_N += N acc_u_x += u[i] return acc_N, acc_u_x - @abstractmethod def Mstep(self, stats): pass - def elbo(self, x, u_x=None, N=1, log_h=None, sample_weight=None, batch_size=None): if u_x is None: - N, u_x = self.accum_suff_stats(x, sample_weight=sample_weight, - batch_size=batch_size) + N, u_x = self.accum_suff_stats( + x, sample_weight=sample_weight, batch_size=batch_size + ) if log_h is None: log_h = self.accum_log_h(x, sample_weight=sample_weight) return log_h + np.sum(u_x * self.eta) + np.inner(N, self.log_pi - self.A) - - def log_prob(self, x, u_x=None, mode='nat'): - if mode == 'nat': + def log_prob(self, x, u_x=None, mode="nat"): + if mode == "nat": return self.log_prob_nat(x, u_x) else: return self.log_prob_std(x) - - def log_prob_nat(self, x, u_x = None): + def log_prob_nat(self, x, u_x=None): if u_x is None: u_x = self.compute_suff_stats(x) llk_k = np.dot(u_x, self.eta.T) - self.A + self.log_pi llk = logsumexp(llk_k) return self.log_h(x) + llk - @abstractmethod def log_prob_std(self, x): pass - - - - def log_prob_nbest(self, x, u_x=None, mode='nat', - nbest_mode='master', nbest=1): - if mode == 'nat': - return 
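In natural-parameter form the mixture log-likelihood used above is log p(x) = log h(x) + logsumexp_k( u(x)·eta_k - A_k + log pi_k ). A standalone sketch of that evaluation (SciPy's logsumexp stands in for the hyperion helper of the same name):

import numpy as np
from scipy.special import logsumexp

def mixture_log_prob(u, eta, A, log_pi, log_h=0.0):
    # u: (T, E) sufficient stats, eta: (K, E), A: (K,) log-normalizers, log_pi: (K,)
    llk_k = u @ eta.T - A + log_pi         # per-component joint log-densities
    return log_h + logsumexp(llk_k, axis=-1)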
self.log_prob_nbest_nat( - x, u_x, nbest_mode=nbest_mode, nbest=nbest) + + def log_prob_nbest(self, x, u_x=None, mode="nat", nbest_mode="master", nbest=1): + if mode == "nat": + return self.log_prob_nbest_nat(x, u_x, nbest_mode=nbest_mode, nbest=nbest) else: - return self.log_prob_std( - x, nbest_mode=nbest_mode, nbest=nbest) + return self.log_prob_std(x, nbest_mode=nbest_mode, nbest=nbest) + def log_prob_nbest_nat(self, x, u_x=None, nbest_mode="master", nbest=1): - - def log_prob_nbest_nat(self, x, u_x=None, - nbest_mode='master', nbest=1): - if u_x is None: u_x = self.compute_suff_stats(x) - if nbest_mode == 'master': + if nbest_mode == "master": assert isinstance(nbest, int) llk_k = np.dot(u_x, self.eta.T) - self.A + self.log_pi - nbest = np.argsort(llk_k)[:-(nbest+1):-1] + nbest = np.argsort(llk_k)[: -(nbest + 1) : -1] llk_k = llk_k[nbest] else: - llk_k = np.dot(u_x, self.eta[nbest,:].T) - self.A + self.log_pi + llk_k = np.dot(u_x, self.eta[nbest, :].T) - self.A + self.log_pi llk = logsumexp(llk_k) return self.log_h(x) + llk - @abstractmethod - def log_prob_nbest_std(self, x, - nbest_mode='master', nbest=1): + def log_prob_nbest_std(self, x, nbest_mode="master", nbest=1): pass - - def get_config(self): - config = {'min_n': self.min_N, - 'update_pi': self.update_pi } + config = {"min_n": self.min_N, "update_pi": self.update_pi} base_config = super(ExpFamilyMixture, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - @staticmethod def tuple2data(data): if isinstance(data, tuple): @@ -503,45 +502,37 @@ def tuple2data(data): sample_weight = u_x u_x = None else: - raise ValueError('Generator output: ' + str(data)) + raise ValueError("Generator output: " + str(data)) elif len(data) == 3: x, u_x, sample_weight = data else: - raise ValueError('Generator output: ' + str(data)) + raise ValueError("Generator output: " + str(data)) else: x = data u_x = None sample_weight = None return x, u_x, sample_weight - @staticmethod def compute_A_nat(eta): raise NotImplementedError() - @staticmethod def compute_A_std(params): raise NotImplementedError() - @staticmethod def compute_eta(param): raise NotImplementedError() - @staticmethod def compute_std(eta): raise NotImplementedError() - @abstractmethod def _compute_nat_params(self): pass - @abstractmethod def _compute_std_params(self): pass - - diff --git a/hyperion/pdfs/mixtures/gmm.py b/hyperion/pdfs/mixtures/gmm.py index d1512114..b71f0a61 100644 --- a/hyperion/pdfs/mixtures/gmm.py +++ b/hyperion/pdfs/mixtures/gmm.py @@ -9,8 +9,22 @@ from ...hyp_defs import float_cpu -from ...utils.math import softmax, logsumexp, invert_pdmat, invert_trimat, symmat2vec, vec2symmat, fullcov_varfloor, logdet_pdmat -from ...utils.plotting import plot_gaussian_1D, plot_gaussian_ellipsoid_2D, plot_gaussian_ellipsoid_3D, plot_gaussian_3D +from ...utils.math import ( + softmax, + logsumexp, + invert_pdmat, + invert_trimat, + symmat2vec, + vec2symmat, + fullcov_varfloor, + logdet_pdmat, +) +from ...utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, + plot_gaussian_3D, +) from ...clustering import KMeans from ..core import Normal @@ -18,10 +32,15 @@ class GMM(ExpFamilyMixture): - - def __init__(self, mu=None, Lambda=None, var_floor=1e-3, - update_mu=True, update_Lambda=True, - **kwargs): + def __init__( + self, + mu=None, + Lambda=None, + var_floor=1e-3, + update_mu=True, + update_Lambda=True, + **kwargs + ): super().__init__(**kwargs) self.mu = mu self.Lambda = Lambda @@ -30,13 +49,11 @@ def 
__init__(self, mu=None, Lambda=None, var_floor=1e-3, self.update_Lambda = update_Lambda self._compute_gmm_nat_std() - + self._logLambda = None self._cholLambda = None self._Sigma = None - - def _compute_gmm_nat_std(self): if self.mu is not None and self.Lambda is not None: self._validate_mu() @@ -47,70 +64,60 @@ def _compute_gmm_nat_std(self): self.A = self.compute_A_nat(self.eta) self._compute_std_params() - - def compute_Lambda_aux(self): self._logLambda = np.zeros((self.num_comp,), dtype=float_cpu()) - self._cholLambda = np.zeros((self.num_comp, self.x_dim, self.x_dim), - dtype=float_cpu()) + self._cholLambda = np.zeros( + (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu() + ) for i, L in enumerate(self.Lambda): f, L, logL = invert_pdmat(L, return_logdet=True) self._logLambda[i] = logL self._cholLambda[i] = L.T - - @property def logLambda(self): if self._logLambda is None: self.compute_Lambda_aux() return self._logLambda - - @property def cholLambda(self): if self._cholLambda is None: self.compute_Lambda_aux() return self._cholLambda - - @property def Sigma(self): if self._Sigma is None: - self._Sigma = np.zeros((self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu()) + self._Sigma = np.zeros( + (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu() + ) for k in range(self.num_comp): self._Sigma[k] = invert_pdmat(self.Lambda[k], return_inv=True)[-1] return self._Sigma - - def initialize(self, x=None): if x is None and self.mu is None and self.eta is None: - assert self.num_comp==1 + assert self.num_comp == 1 self._initialize_stdnormal() if x is not None: self._initialize_kmeans(self.num_comp, x) self.validate() self._compute_gmm_nat_std() - def _initialize_stdnormal(self): self.pi = np.array([1], dtype=float_cpu()) self.mu = np.zeros((1, self.x_dim), dtype=float_cpu()) self.Lambda = np.zeros((1, self.x_dim, self.x_dim), dtype=float_cpu()) self.Lambda[0] = np.eye(self.x_dim, dtype=float_cpu()) - - def _initialize_kmeans(self, num_comp, x): - if num_comp==1: + if num_comp == 1: self.pi = np.array([1], dtype=float_cpu()) self.mu = np.mean(x, axis=0, keepdims=True) self.Lambda = np.zeros((1, self.x_dim, self.x_dim), dtype=float_cpu()) delta = x - self.mu - S = np.dot(delta.T, delta)/x.shape[0] + S = np.dot(delta.T, delta) / x.shape[0] self.Lambda[0] = invert_pdmat(S, return_inv=True)[-1] return @@ -119,62 +126,55 @@ def _initialize_kmeans(self, num_comp, x): self.mu = kmeans.mu self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) - self.Lambda = np.zeros((self.num_comp, self.x_dim, self.x_dim), - dtype=float_cpu()) + self.Lambda = np.zeros( + (self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu() + ) for k in range(num_comp): - r = cluster_index==k - self.pi[k] = np.sum(r)/x.shape[0] + r = cluster_index == k + self.pi[k] = np.sum(r) / x.shape[0] delta = x[r] - self.mu[k] - S = np.dot(delta.T, delta)/np.sum(r) + S = np.dot(delta.T, delta) / np.sum(r) self.Lambda[k] = invert_pdmat(S, return_inv=True)[-1] - - def stack_suff_stats(self, F, S=None): if S is None: return F - return np.hstack((F,S)) - + return np.hstack((F, S)) - def unstack_suff_stats(self, stats): - F=stats[:,:self.x_dim] - S=stats[:,self.x_dim:] + F = stats[:, : self.x_dim] + S = stats[:, self.x_dim :] return F, S - - def norm_suff_stats(self, N, u_x, return_order2=False): F, S = self.unstack_suff_stats(u_x) - F_norm = F - N[:,None]*self.mu + F_norm = F - N[:, None] * self.mu for k in range(self.num_comp): F_norm[k] = np.dot(F_norm[k], self.cholLambda[k].T) if return_order2: SS = vec2symat(S[k]) Fmu = 
np.outer(self.F[k], self.mu[k]) - SS = SS-Fmu-Fmu.T+N*np.outer(self.mu[k],self.mu[k]) + SS = SS - Fmu - Fmu.T + N * np.outer(self.mu[k], self.mu[k]) SS = np.dot(self.cholLambda[k], np.dot(SS, self.cholLambda[k].T)) S[k] = symmat2vec(SS) if return_order2: return N, self.stack_suff_stats(F_norm, S) return N, F_norm - - def Mstep(self, N, u_x): F, S = self.unstack_suff_stats(u_x) if self.update_mu: - self.mu = F/N[:,None] + self.mu = F / N[:, None] if self.update_Lambda: C = np.zeros((self.num_comp, self.x_dim, self.x_dim), dtype=float_cpu()) for k in range(self.num_comp): - C[k] = vec2symmat(S[k]/N[k]) - C[k] -= np.outer(self.mu[k],self.mu[k]) - Sfloor = self.var_floor*np.mean(C, axis=0) + C[k] = vec2symmat(S[k] / N[k]) + C[k] -= np.outer(self.mu[k], self.mu[k]) + Sfloor = self.var_floor * np.mean(C, axis=0) cholfloor = la.cholesky(Sfloor, overwrite_a=True) for k in range(self.num_comp): C[k] = fullcov_varfloor(C[k], cholfloor, F_is_chol=True) @@ -189,49 +189,43 @@ def Mstep(self, N, u_x): N[N0] = 0 mu[N0] = 0 S[N0] = 1 - self.pi = N/np.sum(N) + self.pi = N / np.sum(N) self._log_pi = None self._compute_nat_params() - - def split_comp(self, K=2): num_comp = self.num_comp * K - pi = np.repeat(self.pi, K)/K - Lambda = np.repeat(self.Lambda, K, axis=0)*(K**2) + pi = np.repeat(self.pi, K) / K + Lambda = np.repeat(self.Lambda, K, axis=0) * (K ** 2) mu = np.repeat(self.mu, K, axis=0) - + for g in range(self.num_comp): w, v = la.eigh(self.Sigma[g]) v *= np.sqrt(v) - if K==2: + if K == 2: std_dev = np.sum(v, axis=1) - mu[2*g] += std_dev - mu[2*g+1] -= std_dev + mu[2 * g] += std_dev + mu[2 * g + 1] -= std_dev else: for k in range(K): - factor = 2*(np.random.uniform(size=(v.shape[1],)) > 0.5) - 1 - std_dev = np.sum(v*factor, axis=1) - mu[K*g+k] += std_dev + factor = 2 * (np.random.uniform(size=(v.shape[1],)) > 0.5) - 1 + std_dev = np.sum(v * factor, axis=1) + mu[K * g + k] += std_dev config = self.get_config() return GMM(pi=pi, mu=mu, Lambda=Lambda, **config) - - def log_prob_std(self, x): - r0 = self.log_pi + 0.5*self.logLambda - 0.5*self.x_dim*np.log(2*np.pi) + r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi) llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) - for k in range(self.num_comp): - mah_dist2 = np.sum(np.dot(x-self.mu[k], self.cholLambda[k])**2, axis=1) - llk_k[:,k] = r0[k] - 0.5*mah_dist2 - - return logsumexp(llk_k, axis=-1) + for k in range(self.num_comp): + mah_dist2 = np.sum(np.dot(x - self.mu[k], self.cholLambda[k]) ** 2, axis=1) + llk_k[:, k] = r0[k] - 0.5 * mah_dist2 + return logsumexp(llk_k, axis=-1) - def sample(self, num_samples, rng=None, seed=1024): if rng is None: rng = np.random.RandomState(seed) @@ -239,46 +233,45 @@ def sample(self, num_samples, rng=None, seed=1024): r = rng.multinomial(1, self.pi, size=(num_samples,)) x = np.zeros((num_samples, self.x_dim), dtype=float_cpu()) for k in range(self.num_comp): - index = r[:, k]==1 + index = r[:, k] == 1 n_k = np.sum(index) if n_k == 0: continue x[index] = rng.multivariate_normal( - self.mu[k], self.Sigma[k], size=(n_k,)).astype(float_cpu()) - - return x + self.mu[k], self.Sigma[k], size=(n_k,) + ).astype(float_cpu()) + return x - def get_config(self): - config = {'var_floor': self.var_floor, - 'update_mu': self.update_mu, - 'update_lambda': self.update_Lambda } + config = { + "var_floor": self.var_floor, + "update_mu": self.update_mu, + "update_lambda": self.update_Lambda, + } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - def 
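For the full-covariance case above, each component log-density is evaluated through a Cholesky factor of the precision: log N(x; mu_k, Lambda_k^-1) = 0.5 logdet(Lambda_k) - (d/2) log(2*pi) - 0.5 ||(x - mu_k) L_k||^2 with Lambda_k = L_k L_k^T. A standalone NumPy/SciPy sketch, not the hyperion implementation:

import numpy as np
from scipy.special import logsumexp

def gmm_log_prob(x, pi, mu, Lambda):
    # x: (T, d), pi: (K,), mu: (K, d), Lambda: (K, d, d) precision matrices
    T, d = x.shape
    llk_k = np.zeros((T, len(pi)))
    for k in range(len(pi)):
        L = np.linalg.cholesky(Lambda[k])                # Lambda[k] = L @ L.T
        logdet = 2.0 * np.sum(np.log(np.diag(L)))
        mah2 = np.sum(((x - mu[k]) @ L) ** 2, axis=1)    # Mahalanobis distances
        llk_k[:, k] = (np.log(pi[k]) + 0.5 * logdet
                       - 0.5 * d * np.log(2 * np.pi) - 0.5 * mah2)
    return logsumexp(llk_k, axis=-1)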
save_params(self, f): - params = {'pi': self.pi, - 'mu': self.mu, - 'Lambda': self.Lambda} + params = {"pi": self.pi, "mu": self.mu, "Lambda": self.Lambda} self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['pi', 'mu', 'Lambda'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(x_dim=config['x_dim'], pi=params['pi'], - mu=params['mu'], Lambda=params['Lambda'], - var_floor=config['var_floor'], - min_N=config['min_n'], - update_pi=config['update_pi'], - update_mu=config['update_mu'], update_Lambda=config['update_lambda'], - name=config['name']) - + param_list = ["pi", "mu", "Lambda"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + x_dim=config["x_dim"], + pi=params["pi"], + mu=params["mu"], + Lambda=params["Lambda"], + var_floor=config["var_floor"], + min_N=config["min_n"], + update_pi=config["update_pi"], + update_mu=config["update_mu"], + update_Lambda=config["update_lambda"], + name=config["name"], + ) @classmethod def load_from_kaldi(cls, file_path): @@ -288,7 +281,7 @@ def load_from_kaldi(cls, file_path): num_comp = 0 x_dim = 0 success = False - with open(file_path, 'r') as f: + with open(file_path, "r") as f: while True: line = f.readline() if not line: @@ -297,57 +290,52 @@ def load_from_kaldi(cls, file_path): if fields[0] == "": pi = np.array([float(v) for v in fields[2:-1]], dtype=float_cpu()) num_comp = len(pi) - elif fields[0]=="": + elif fields[0] == "": for k in range(num_comp): line = f.readline() fields = line.split() if x_dim == 0: x_dim = len(fields) eta1 = np.zeros((num_comp, x_dim), dtype=float_cpu()) - eta2 = np.zeros((num_comp, int((x_dim**2+3*x_dim)/2)), dtype=float_cpu()) - - assert(len(fields) == x_dim or len(fields) == x_dim+1) - eta1[k] = [ float(v) for v in fields[:x_dim] ] - elif fields[0]=="": + eta2 = np.zeros( + (num_comp, int((x_dim ** 2 + 3 * x_dim) / 2)), + dtype=float_cpu(), + ) + + assert len(fields) == x_dim or len(fields) == x_dim + 1 + eta1[k] = [float(v) for v in fields[:x_dim]] + elif fields[0] == "": L = np.zeros((x_dim, x_dim), dtype=float_cpu()) for k in range(num_comp): - L[:,:] = 0 + L[:, :] = 0 for j in range(x_dim): line = f.readline() fields = line.split() - if j < x_dim -1: - assert(len(fields) == j+1) + if j < x_dim - 1: + assert len(fields) == j + 1 else: - assert(len(fields) == x_dim+1) - L[j,:j+1] = [ float(v) for v in fields[:j+1] ] - eta2[k] = - symmat2vec(L.T, diag_factor=0.5) - if k == num_comp-1: + assert len(fields) == x_dim + 1 + L[j, : j + 1] = [float(v) for v in fields[: j + 1]] + eta2[k] = -symmat2vec(L.T, diag_factor=0.5) + if k == num_comp - 1: success = True - assert(success) + assert success eta = np.hstack((eta1, eta2)) return cls(x_dim=x_dim, pi=pi, eta=eta) - - def _validate_mu(self): - assert(self.mu.shape[0] == self.num_comp) - assert(self.mu.shape[1] == self.x_dim) - + assert self.mu.shape[0] == self.num_comp + assert self.mu.shape[1] == self.x_dim - def _validate_Lambda(self): - assert(self.Lambda.shape[0] == self.num_comp) - assert(self.Lambda.shape[1] == self.x_dim) - assert(self.Lambda.shape[2] == self.x_dim) + assert self.Lambda.shape[0] == self.num_comp + assert self.Lambda.shape[1] == self.x_dim + assert self.Lambda.shape[2] == self.x_dim - - def _validate_eta(self): - assert(self.eta.shape[0] == self.num_comp) - assert(self.eta.shape[1] == (self.x_dim**2+3*self.x_dim)/2) - + assert self.eta.shape[0] == self.num_comp + assert self.eta.shape[1] == (self.x_dim ** 2 + 3 * self.x_dim) / 2 - def 
validate(self): if self.pi is not None: self._validate_pi() @@ -355,35 +343,30 @@ def validate(self): if self.mu is not None and self.Lambda is not None: self._validate_mu() self._validate_Lambda() - + if self.eta is not None: self._validate_eta() - @staticmethod def compute_eta(mu, Lambda): x_dim = mu.shape[-1] - eta_dim = int((x_dim**2+3*x_dim)/2) + eta_dim = int((x_dim ** 2 + 3 * x_dim) / 2) eta = np.zeros((mu.shape[0], eta_dim), dtype=float_cpu()) for k in range(mu.shape[0]): eta[k] = Normal.compute_eta(mu[k], Lambda[k]) return eta - - @staticmethod def compute_std(eta): x_dim = Normal.compute_x_dim_from_eta(eta) mu = np.zeros((eta.shape[0], x_dim), dtype=float_cpu()) - Lambda = np.zeros((eta.shape[0], x_dim, x_dim), dtype='float32') + Lambda = np.zeros((eta.shape[0], x_dim, x_dim), dtype="float32") for k in range(eta.shape[0]): mu[k], Lambda[k] = Normal.compute_std(eta[k]) return mu, Lambda - - @staticmethod def compute_A_nat(eta): A = np.zeros((eta.shape[0],), dtype=float_cpu()) @@ -392,8 +375,6 @@ def compute_A_nat(eta): return A - - @staticmethod def compute_A_std(mu, Lambda): A = np.zeros((mu.shape[0],), dtype=float_cpu()) @@ -402,68 +383,51 @@ def compute_A_std(mu, Lambda): return A - - def _compute_nat_params(self): self.eta = self.compute_eta(self.mu, self.Lambda) self.A = self.compute_A_nat(self.eta) - - def _compute_std_params(self): self.mu, self.Lambda = self.compute_std(self.eta) self._cholLambda = None self._logLambda = None self._Sigma = None - - @staticmethod def compute_suff_stats(x): - d=x.shape[1] - u=np.zeros((x.shape[0], int(d+d*(d+1)/2)), dtype=float_cpu()) - u[:,:d]=x - k=d + d = x.shape[1] + u = np.zeros((x.shape[0], int(d + d * (d + 1) / 2)), dtype=float_cpu()) + u[:, :d] = x + k = d for i in range(d): for j in range(i, d): - u[:,k]=x[:,i]*x[:,j] - k+=1 + u[:, k] = x[:, i] * x[:, j] + k += 1 return u - - def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[:,feat_idx] + mu = self.mu[:, feat_idx] for k in range(mu.shape[0]): C = invert_pdmat(self.Lambda[k], return_inv=True)[-1][feat_idx, feat_idx] plot_gaussian_1D(mu[k], C, num_sigmas, num_pts, **kwargs) - - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[:,feat_idx] + mu = self.mu[:, feat_idx] j, i = np.meshgrid(feat_idx, feat_idx) for k in range(mu.shape[0]): C_k = invert_pdmat(self.Lambda[k], return_inv=True)[-1][i, j] - plot_gaussian_ellipsoid_2D( - mu[k], C_k, num_sigmas, num_pts, **kwargs) - + plot_gaussian_ellipsoid_2D(mu[k], C_k, num_sigmas, num_pts, **kwargs) - def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[:,feat_idx] + mu = self.mu[:, feat_idx] j, i = np.meshgrid(feat_idx, feat_idx) for k in range(mu.shape[0]): C_k = invert_pdmat(self.Lambda[k], return_inv=True)[-1][i, j] plot_gaussian_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) - - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, - **kwargs): - mu=self.mu[:,feat_idx] + def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): + mu = self.mu[:, feat_idx] j, i = np.meshgrid(feat_idx, feat_idx) for k in range(mu.shape[0]): C_k = invert_pdmat(self.Lambda[k], return_inv=True)[-1][i, j] - plot_gaussian_ellipsoid_3D(mu[k], C_k, num_sigmas, num_pts, - **kwargs) + plot_gaussian_ellipsoid_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) diff --git a/hyperion/pdfs/mixtures/gmm_diag_cov.py b/hyperion/pdfs/mixtures/gmm_diag_cov.py index e99b3e98..b586a900 100644 --- a/hyperion/pdfs/mixtures/gmm_diag_cov.py +++ 
b/hyperion/pdfs/mixtures/gmm_diag_cov.py @@ -9,32 +9,40 @@ from ...hyp_defs import float_cpu from ...utils.math import softmax, logsumexp -from ...utils.plotting import plot_gaussian_1D, plot_gaussian_ellipsoid_2D, plot_gaussian_ellipsoid_3D, plot_gaussian_3D +from ...utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, + plot_gaussian_3D, +) from ...clustering import KMeans from .exp_family_mixture import ExpFamilyMixture class GMMDiagCov(ExpFamilyMixture): - - def __init__(self, mu=None, Lambda=None, var_floor=1e-3, - update_mu=True, update_Lambda=True, - **kwargs): + def __init__( + self, + mu=None, + Lambda=None, + var_floor=1e-3, + update_mu=True, + update_Lambda=True, + **kwargs + ): super().__init__(**kwargs) self.mu = mu self.Lambda = Lambda self.var_floor = var_floor self.update_mu = update_mu self.update_Lambda = update_Lambda - + self._compute_gmm_nat_std() - + self._logLambda = None self._cholLambda = None self._Sigma = None - - def _compute_gmm_nat_std(self): if self.mu is not None and self.Lambda is not None: self._validate_mu() @@ -44,56 +52,44 @@ def _compute_gmm_nat_std(self): self._validate_eta() self.A = self.compute_A_nat(self.eta) self._compute_std_params() - - @property def logLambda(self): if self._logLambda is None: self._logLambda = np.sum(np.log(self.Lambda), axis=-1) return self._logLambda - - @property def cholLambda(self): if self._cholLambda is None: self._cholLambda = np.sqrt(self.Lambda) return self._cholLambda - - @property def Sigma(self): if self._Sigma is None: - self._Sigma = 1./self.Lambda + self._Sigma = 1.0 / self.Lambda return self._Sigma - - def initialize(self, x=None): if x is None and self.mu is None and self.eta is None: - assert self.num_comp==1 + assert self.num_comp == 1 self._initialize_stdnormal() if x is not None: self._initialize_kmeans(self.num_comp, x) self.validate() self._compute_gmm_nat_std() - - def _initialize_stdnormal(self): self.pi = np.array([1], dtype=float_cpu()) self.mu = np.zeros((1, self.x_dim), dtype=float_cpu()) self.Lambda = np.ones((1, self.x_dim), dtype=float_cpu()) - - def _initialize_kmeans(self, num_comp, x): - if num_comp==1: - self.pi=np.array([1], dtype=float_cpu()) - self.mu=np.mean(x, axis=0, keepdims=True) - self.Lambda=1/np.std(x, axis=0, keepdims=True)**2 + if num_comp == 1: + self.pi = np.array([1], dtype=float_cpu()) + self.mu = np.mean(x, axis=0, keepdims=True) + self.Lambda = 1 / np.std(x, axis=0, keepdims=True) ** 2 return kmeans = KMeans(num_clusters=num_comp) @@ -101,53 +97,44 @@ def _initialize_kmeans(self, num_comp, x): self.mu = kmeans.mu self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) - self.Lambda = np.zeros((self.num_comp, x.shape[-1]), - dtype=float_cpu()) + self.Lambda = np.zeros((self.num_comp, x.shape[-1]), dtype=float_cpu()) for k in range(num_comp): - r=cluster_index==k - self.pi[k] = np.sum(r)/x.shape[0] - self.Lambda[k] = 1/np.std(x[r], axis=0)**2 + r = cluster_index == k + self.pi[k] = np.sum(r) / x.shape[0] + self.Lambda[k] = 1 / np.std(x[r], axis=0) ** 2 - - def stack_suff_stats(self, F, S=None): if S is None: return F - return np.hstack((F,S)) - + return np.hstack((F, S)) - def unstack_suff_stats(self, stats): - F=stats[:,:self.x_dim] - S=stats[:,self.x_dim:] + F = stats[:, : self.x_dim] + S = stats[:, self.x_dim :] return F, S - - def norm_suff_stats(self, N, u_x, return_order2=False): F, S = self.unstack_suff_stats(acc_u_x) - F_norm = self.cholLambda*(F-N[:,None]*self.mu) + F_norm = self.cholLambda * (F - N[:, None] * 
self.mu) if return_order2: - S=S-2*self.mu*F+N*self.mu**2 + S = S - 2 * self.mu * F + N * self.mu ** 2 S *= self.Lambda - return N, self.stack_suff_stats(F_norm, S) + return N, self.stack_suff_stats(F_norm, S) return N, F_norm - - def Mstep(self, N, u_x): F, S = self.unstack_suff_stats(u_x) if self.update_mu: - self.mu = F/N[:, None] + self.mu = F / N[:, None] if self.update_Lambda: - S = S/N[:,None] - self.mu**2 - S_floor = self.var_floor * np.mean(S[N>self.min_N], axis=0) + S = S / N[:, None] - self.mu ** 2 + S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0) S = np.maximum(S, S_floor) - self.Lambda=1/S + self.Lambda = 1 / S self._Sigma = S self._cholLambda = None self._logLambda = None @@ -158,100 +145,90 @@ def Mstep(self, N, u_x): N[N0] = 0 mu[N0] = 0 S[N0] = 1 - self.pi = N/np.sum(N) + self.pi = N / np.sum(N) self._log_pi = None - + self._compute_nat_params() - - def split_comp(self, K=2): - std_dev = 1/self.cholLambda + std_dev = 1 / self.cholLambda num_comp = self.num_comp * K - pi = np.repeat(self.pi, K)/K - Lambda = np.repeat(self.Lambda, K, axis=0)*(K**2) + pi = np.repeat(self.pi, K) / K + Lambda = np.repeat(self.Lambda, K, axis=0) * (K ** 2) mu = np.repeat(self.mu, K, axis=0) - - if K==2: + + if K == 2: mu[::2] += std_dev mu[1::2] -= std_dev else: for k in range(K): - factor = 2*(np.random.uniform(size=std_dev.shape) > 0.5) - 1 - mu[k::K] += factor*std_dev + factor = 2 * (np.random.uniform(size=std_dev.shape) > 0.5) - 1 + mu[k::K] += factor * std_dev config = self.get_config() return GMMDiagCov(pi=pi, mu=mu, Lambda=Lambda, **config) - - def log_prob_std(self, x): - r0 = self.log_pi + 0.5*self.logLambda-0.5*self.x_dim*np.log(2*np.pi) + r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi) llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) for k in range(self.num_comp): - mah_dist2 = np.sum(((x-self.mu[k])*self.cholLambda[k])**2, axis=-1) - llk_k[:,k] = r0[k] - 0.5*mah_dist2 - return logsumexp(llk_k, axis=-1) - + mah_dist2 = np.sum(((x - self.mu[k]) * self.cholLambda[k]) ** 2, axis=-1) + llk_k[:, k] = r0[k] - 0.5 * mah_dist2 + return logsumexp(llk_k, axis=-1) - def log_cdf(self, x): llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) for k in range(self.num_comp): - delta = (x-self.mu[k])*self.cholLambda[k] - lk = 0.5*(1+erf(delta/np.sqrt(2))) - llk_k[:,k] = self.log_pi[k] + np.sum(np.log(lk+1e-20), axis=-1) + delta = (x - self.mu[k]) * self.cholLambda[k] + lk = 0.5 * (1 + erf(delta / np.sqrt(2))) + llk_k[:, k] = self.log_pi[k] + np.sum(np.log(lk + 1e-20), axis=-1) return logsumexp(llk_k) - - def sample(self, num_samples, rng=None, seed=1024): if rng is None: - rng=np.random.RandomState(seed) + rng = np.random.RandomState(seed) r = rng.multinomial(1, self.pi, size=(num_samples,)) x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) for k in range(self.num_comp): - index = r[:, k]==1 - x[index] = 1./self.cholLambda[k]*x[index] + self.mu[k] - - return x + index = r[:, k] == 1 + x[index] = 1.0 / self.cholLambda[k] * x[index] + self.mu[k] + return x - def get_config(self): - config = {'var_floor': self.var_floor, - 'update_mu': self.update_mu, - 'update_lambda': self.update_Lambda } + config = { + "var_floor": self.var_floor, + "update_mu": self.update_mu, + "update_lambda": self.update_Lambda, + } base_config = super(GMMDiagCov, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = { 'pi': self.pi, - 'mu': self.mu, - 'Lambda': self.Lambda} 
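The diagonal-covariance M-step above reduces to mu = F / N and Sigma = S / N - mu**2, with a floor applied to the variances before inverting them to precisions. A toy, self-contained EM iteration mirroring those updates (the names and the simplified floor are illustrative, not hyperion's exact code):

import numpy as np
from scipy.special import logsumexp

def em_step(x, pi, mu, Lambda, var_floor=1e-3):
    # E-step: responsibilities from diagonal-Gaussian log-densities
    log_r = (np.log(pi) + 0.5 * np.sum(np.log(Lambda), axis=1)
             - 0.5 * x.shape[1] * np.log(2 * np.pi)
             - 0.5 * np.einsum('td,kd->tk', x ** 2, Lambda)
             + np.einsum('td,kd->tk', x, Lambda * mu)
             - 0.5 * np.sum(Lambda * mu ** 2, axis=1))
    r = np.exp(log_r - logsumexp(log_r, axis=1, keepdims=True))

    # sufficient statistics: N = sum r, F = sum r*x, S = sum r*x**2
    N = r.sum(axis=0)
    F = r.T @ x
    S = r.T @ x ** 2

    # M-step with a (simplified) variance floor
    mu = F / N[:, None]
    Sigma = S / N[:, None] - mu ** 2
    Sigma = np.maximum(Sigma, var_floor * Sigma.mean(axis=0))
    return N / N.sum(), mu, 1.0 / Sigma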
+ params = {"pi": self.pi, "mu": self.mu, "Lambda": self.Lambda} self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['pi', 'mu', 'Lambda'] - params = self._load_params_to_dict(f, config['name'], param_list) - return cls(x_dim=config['x_dim'], pi=params['pi'], - mu=params['mu'], Lambda=params['Lambda'], - var_floor=config['var_floor'], - min_N=config['min_n'], - update_pi=config['update_pi'], - update_mu=config['update_mu'], update_Lambda=config['update_lambda'], - name=config['name']) - + param_list = ["pi", "mu", "Lambda"] + params = self._load_params_to_dict(f, config["name"], param_list) + return cls( + x_dim=config["x_dim"], + pi=params["pi"], + mu=params["mu"], + Lambda=params["Lambda"], + var_floor=config["var_floor"], + min_N=config["min_n"], + update_pi=config["update_pi"], + update_mu=config["update_mu"], + update_Lambda=config["update_lambda"], + name=config["name"], + ) @classmethod def load_from_kaldi(cls, file_path): @@ -261,7 +238,7 @@ def load_from_kaldi(cls, file_path): num_comp = 0 x_dim = 0 success = False - with open(file_path, 'r') as f: + with open(file_path, "r") as f: while True: line = f.readline() if not line: @@ -270,7 +247,7 @@ def load_from_kaldi(cls, file_path): if fields[0] == "": pi = np.array([float(v) for v in fields[2:-1]], dtype=float_cpu()) num_comp = len(pi) - elif fields[0]=="": + elif fields[0] == "": for k in range(num_comp): line = f.readline() fields = line.split() @@ -278,153 +255,122 @@ def load_from_kaldi(cls, file_path): x_dim = len(fields) eta1 = np.zeros((num_comp, x_dim), dtype=float_cpu()) eta2 = np.zeros((num_comp, x_dim), dtype=float_cpu()) - - assert(len(fields) == x_dim or len(fields) == x_dim+1) - eta1[k] = [ float(v) for v in fields[:x_dim] ] - elif fields[0]=="": + + assert len(fields) == x_dim or len(fields) == x_dim + 1 + eta1[k] = [float(v) for v in fields[:x_dim]] + elif fields[0] == "": for k in range(num_comp): line = f.readline() fields = line.split() - assert(len(fields) == x_dim or len(fields) == x_dim+1) - eta2[k] = [ -0.5*float(v) for v in fields[:x_dim] ] - if k == num_comp-1: + assert len(fields) == x_dim or len(fields) == x_dim + 1 + eta2[k] = [-0.5 * float(v) for v in fields[:x_dim]] + if k == num_comp - 1: success = True - assert success + assert success eta = np.hstack((eta1, eta2)) return cls(x_dim=x_dim, pi=pi, eta=eta) - def _validate_mu(self): - assert(self.mu.shape[0] == self.num_comp) - assert(self.mu.shape[1] == self.x_dim) - + assert self.mu.shape[0] == self.num_comp + assert self.mu.shape[1] == self.x_dim - def _validate_Lambda(self): assert self.Lambda.shape[0] == self.num_comp assert self.Lambda.shape[1] == self.x_dim assert np.all(self.Lambda > 0) - - def _validate_eta(self): assert self.eta.shape[0] == self.num_comp - assert self.eta.shape[1] == self.x_dim*2 - + assert self.eta.shape[1] == self.x_dim * 2 - def validate(self): if self.pi is not None: self._validate_pi() - + if self.mu is not None and self.Lambda is not None: self._validate_mu() self._validate_Lambda() - + if self.eta is not None: self._validate_eta() - - @staticmethod def compute_eta(mu, Lambda): - Lmu = Lambda*mu - eta = np.hstack((Lmu, -0.5*Lambda)) + Lmu = Lambda * mu + eta = np.hstack((Lmu, -0.5 * Lambda)) return eta - @staticmethod def compute_std(eta): - x_dim = int(eta.shape[-1]/2) - eta1 = eta[:,:x_dim] - eta2 = eta[:,x_dim:] - mu = -0.5*eta1/eta2 - Lambda = -2*eta2 + x_dim = int(eta.shape[-1] / 2) + eta1 = eta[:, :x_dim] + eta2 = eta[:, x_dim:] + mu = -0.5 * eta1 / eta2 + 
Lambda = -2 * eta2 return mu, Lambda - @staticmethod def compute_A_nat(eta): - x_dim = int(eta.shape[-1]/2) - eta1 = eta[:,:x_dim] - eta2 = eta[:,x_dim:] - r1 = 0.5 * x_dim*np.log(2*np.pi) - r2 = -1/4 * np.sum(eta1*eta1/eta2, axis=-1) - r3 = -1/2 * np.sum(np.log(-2*eta2), axis=-1) + x_dim = int(eta.shape[-1] / 2) + eta1 = eta[:, :x_dim] + eta2 = eta[:, x_dim:] + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = -1 / 4 * np.sum(eta1 * eta1 / eta2, axis=-1) + r3 = -1 / 2 * np.sum(np.log(-2 * eta2), axis=-1) return r1 + r2 + r3 - - @staticmethod def compute_A_std(mu, Lambda): x_dim = mu.shape[1] - r1 = 0.5*x_dim*np.log(2*np.pi) - r2 = -0.5*np.sum(np.log(Lambda), axis=-1) - r3 = 0.5*np.sum(mu*mu*Lambda, axis=-1) + r1 = 0.5 * x_dim * np.log(2 * np.pi) + r2 = -0.5 * np.sum(np.log(Lambda), axis=-1) + r3 = 0.5 * np.sum(mu * mu * Lambda, axis=-1) return r1 + r2 + r3 - - def _compute_nat_params(self): self.eta = self.compute_eta(self.mu, self.Lambda) self.A = self.compute_A_nat(self.eta) - - def _compute_std_params(self): self.mu, self.Lambda = self.compute_std(self.eta) self._cholLambda = None self._logLambda = None self._Sigma = None - - @staticmethod def compute_suff_stats(x): d = x.shape[-1] - u = np.zeros((x.shape[0],2*d), dtype=float_cpu()) - u[:,:d] = x - u[:,d:] = x*x + u = np.zeros((x.shape[0], 2 * d), dtype=float_cpu()) + u[:, :d] = x + u[:, d:] = x * x return u - - def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[:,feat_idx] - C=1/self.Lambda[:,feat_idx] + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[:, feat_idx] for k in range(mu.shape[0]): plot_gaussian_1D(mu[k], C[k], num_sigmas, num_pts, **kwargs) - - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[:,feat_idx] - C=1/self.Lambda[:,feat_idx] + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[:, feat_idx] for k in range(mu.shape[0]): - C_k=np.diag(C[k]) - plot_gaussian_ellipsoid_2D( - mu[k], C_k, num_sigmas, num_pts, **kwargs) + C_k = np.diag(C[k]) + plot_gaussian_ellipsoid_2D(mu[k], C_k, num_sigmas, num_pts, **kwargs) - - def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[:,feat_idx] - C=1/self.Lambda[:,feat_idx] + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[:, feat_idx] for k in range(mu.shape[0]): - C_k=np.diag(C[k]) + C_k = np.diag(C[k]) plot_gaussian_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) - - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, - **kwargs): - mu=self.mu[:,feat_idx] - C=1/self.Lambda[:,feat_idx] + def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[:, feat_idx] for k in range(mu.shape[0]): - C_k=np.diag(C[k]) - plot_gaussian_ellipsoid_3D(mu[k], C_k, num_sigmas, num_pts, - **kwargs) - + C_k = np.diag(C[k]) + plot_gaussian_ellipsoid_3D(mu[k], C_k, num_sigmas, num_pts, **kwargs) DiagGMM = GMMDiagCov diff --git a/hyperion/pdfs/mixtures/gmm_tied_diag_cov.py b/hyperion/pdfs/mixtures/gmm_tied_diag_cov.py index 50469ded..a3e7f93e 100644 --- a/hyperion/pdfs/mixtures/gmm_tied_diag_cov.py +++ b/hyperion/pdfs/mixtures/gmm_tied_diag_cov.py @@ -8,23 +8,36 @@ from ...hyp_defs import float_cpu from ...utils.math import softmax, logsumexp -from ...utils.plotting import plot_gaussian_1D, plot_gaussian_ellipsoid_2D, plot_gaussian_ellipsoid_3D, plot_gaussian_3D +from ...utils.plotting import ( + plot_gaussian_1D, + plot_gaussian_ellipsoid_2D, + plot_gaussian_ellipsoid_3D, + plot_gaussian_3D, +) from ...clustering import 
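A quick numeric round-trip check of the diagonal-Gaussian natural parameters used above (eta = [Lambda*mu, -Lambda/2] with sufficient statistics u(x) = [x, x**2]); the values are arbitrary and the snippet is purely illustrative:

import numpy as np

mu = np.array([[1.0, -2.0]])
Lambda = np.array([[4.0, 0.5]])
eta = np.hstack([Lambda * mu, -0.5 * Lambda])      # compute_eta

d = mu.shape[1]
eta1, eta2 = eta[:, :d], eta[:, d:]
assert np.allclose(-0.5 * eta1 / eta2, mu)         # compute_std recovers mu
assert np.allclose(-2.0 * eta2, Lambda)            # ... and Lambda

x = np.array([[0.3, 1.7]])
u = np.hstack([x, x ** 2])                         # sufficient statistics
print((u @ eta.T).item())                          # Gaussian exponent, up to A(eta) and log h(x)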
KMeans from .gmm_diag_cov import GMMDiagCov class GMMTiedDiagCov(GMMDiagCov): - - def __init__(self, mu=None, Lambda=None, var_floor=1e-3, - update_mu=True, update_Lambda=True, - **kwargs): + def __init__( + self, + mu=None, + Lambda=None, + var_floor=1e-3, + update_mu=True, + update_Lambda=True, + **kwargs + ): super().__init__( - mu=mu, Lambda=Lambda, var_floor=var_floor, - update_mu=update_mu, update_Lambda=update_Lambda, - **kwargs) - - + mu=mu, + Lambda=Lambda, + var_floor=var_floor, + update_mu=update_mu, + update_Lambda=update_Lambda, + **kwargs + ) + def _compute_gmm_nat_std(self): if self.mu is not None and self.Lambda is not None: self._validate_mu() @@ -34,19 +47,17 @@ def _compute_gmm_nat_std(self): self._validate_eta() self.A = self.compute_A_nat(self.eta) self._compute_std_params() - - + def _initialize_stdnormal(self): self.pi = np.array([1], dtype=float_cpu()) self.mu = np.zeros((1, self.x_dim), dtype=float_cpu()) self.Lambda = np.ones((self.x_dim,), dtype=float_cpu()) - def _initialize_kmeans(self, num_comp, x): - if num_comp==1: - self.pi=np.array([1], dtype=float_cpu()) - self.mu=np.mean(x, axis=0, keepdims=True) - self.Lambda=1/np.std(x, axis=0, keepdims=True)**2 + if num_comp == 1: + self.pi = np.array([1], dtype=float_cpu()) + self.mu = np.mean(x, axis=0, keepdims=True) + self.Lambda = 1 / np.std(x, axis=0, keepdims=True) ** 2 return kmeans = KMeans(num_clusters=num_comp) @@ -56,27 +67,26 @@ def _initialize_kmeans(self, num_comp, x): self.pi = np.zeros((self.num_comp,), dtype=float_cpu()) C = np.zeros((x.shape[-1],), dtype=float_cpu()) for k in range(num_comp): - r=cluster_index==k - self.pi[k] = np.sum(r)/x.shape[0] + r = cluster_index == k + self.pi[k] = np.sum(r) / x.shape[0] delta = x[r] - self.mu[k] - C += np.sum(delta**2, axis=0) - - self.Lambda = x.shape[0]/C - - + C += np.sum(delta ** 2, axis=0) + + self.Lambda = x.shape[0] / C + def Mstep(self, N, u_x): F, S = self.unstack_suff_stats(u_x) if self.update_mu: - self.mu = F/N[:, None] + self.mu = F / N[:, None] if self.update_Lambda: - S = S/N[:,None] - self.mu**2 - S_floor = self.var_floor * np.mean(S[N>self.min_N], axis=0) + S = S / N[:, None] - self.mu ** 2 + S_floor = self.var_floor * np.mean(S[N > self.min_N], axis=0) S = np.maximum(S, S_floor) - Spool = np.sum(N[:, None]*S, axis=0)/np.sum(N) - self.Lambda=1/Spool + Spool = np.sum(N[:, None] * S, axis=0) / np.sum(N) + self.Lambda = 1 / Spool self._Sigma = Spool self._cholLambda = None self._logLambda = None @@ -87,117 +97,102 @@ def Mstep(self, N, u_x): N[N0] = 0 mu[N0] = 0 S[N0] = 1 - self.pi = N/np.sum(N) + self.pi = N / np.sum(N) self._log_pi = None - + self._compute_nat_params() - def split_comp(self, K=2): - std_dev = 1/self.cholLambda + std_dev = 1 / self.cholLambda num_comp = self.num_comp * K - pi = np.repeat(self.pi, K)/K + pi = np.repeat(self.pi, K) / K mu = np.repeat(self.mu, K, axis=0) - - if K==2: + + if K == 2: mu[::2] += std_dev mu[1::2] -= std_dev else: for k in range(K): - factor = 2*(np.random.uniform(size=std_dev.shape) > 0.5) - 1 - mu[k::K] += factor*std_dev + factor = 2 * (np.random.uniform(size=std_dev.shape) > 0.5) - 1 + mu[k::K] += factor * std_dev config = self.get_config() return DiagGMMTiedCov(pi=pi, mu=mu, Lambda=self.Lambda, **config) - def log_prob_std(self, x): - r0 = self.log_pi + 0.5*self.logLambda-0.5*self.x_dim*np.log(2*np.pi) + r0 = self.log_pi + 0.5 * self.logLambda - 0.5 * self.x_dim * np.log(2 * np.pi) llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) for k in range(self.num_comp): - mah_dist2 = 
np.sum(((x-self.mu[k])*self.cholLambda)**2, axis=-1) - llk_k[:,k] = r0[k] - 0.5*mah_dist2 - return logsumexp(llk_k, axis=-1) - + mah_dist2 = np.sum(((x - self.mu[k]) * self.cholLambda) ** 2, axis=-1) + llk_k[:, k] = r0[k] - 0.5 * mah_dist2 + return logsumexp(llk_k, axis=-1) def log_cdf(self, x): llk_k = np.zeros((x.shape[0], self.num_comp), dtype=float_cpu()) for k in range(self.num_comp): - delta = (x-self.mu[k])*self.cholLambda - lk = 0.5*(1+erf(delta/np.sqrt(2))) - llk_k[:,k] = self.log_pi[k] + np.sum(np.log(lk+1e-20), axis=-1) + delta = (x - self.mu[k]) * self.cholLambda + lk = 0.5 * (1 + erf(delta / np.sqrt(2))) + llk_k[:, k] = self.log_pi[k] + np.sum(np.log(lk + 1e-20), axis=-1) return logsumexp(llk_k) - def sample(self, num_samples, rng=None, seed=1024): if rng is None: - rng=np.random.RandomState(seed) + rng = np.random.RandomState(seed) r = rng.multinomial(1, self.pi, size=(num_samples,)) x = rng.normal(size=(num_samples, self.x_dim)).astype(float_cpu()) for k in range(self.num_comp): - index = r[:, k]==1 - x[index] = 1./self.cholLambda*x[index] + self.mu[k] - - return x + index = r[:, k] == 1 + x[index] = 1.0 / self.cholLambda * x[index] + self.mu[k] + return x def _validate_Lambda(self): - assert(self.Lambda.shape[0] == self.x_dim) - assert(np.all(self.Lambda > 0)) - + assert self.Lambda.shape[0] == self.x_dim + assert np.all(self.Lambda > 0) @staticmethod def compute_eta(mu, Lambda): - Lmu = Lambda*mu - eta = np.hstack((Lmu, -0.5*np.tile(Lambda, (mu.shape[0],1)))) + Lmu = Lambda * mu + eta = np.hstack((Lmu, -0.5 * np.tile(Lambda, (mu.shape[0], 1)))) return eta - @staticmethod def compute_std(eta): - x_dim = int(eta.shape[-1]/2) - eta1 = eta[:,:x_dim] - eta2 = eta[:,x_dim:] - mu = -0.5*eta1/eta2 - Lambda = -2*eta2[0] + x_dim = int(eta.shape[-1] / 2) + eta1 = eta[:, :x_dim] + eta2 = eta[:, x_dim:] + mu = -0.5 * eta1 / eta2 + Lambda = -2 * eta2[0] return mu, Lambda - def plot1D(self, feat_idx=0, num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[:,feat_idx] - C=1/self.Lambda[feat_idx] + mu = self.mu[:, feat_idx] + C = 1 / self.Lambda[feat_idx] for k in range(mu.shape[0]): plot_gaussian_1D(mu[k], C, num_sigmas, num_pts, **kwargs) - def plot2D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[:,feat_idx] - C=np.diag(1/self.Lambda[feat_idx]) + mu = self.mu[:, feat_idx] + C = np.diag(1 / self.Lambda[feat_idx]) for k in range(mu.shape[0]): - plot_gaussian_ellipsoid_2D( - mu[k], C, num_sigmas, num_pts, **kwargs) - + plot_gaussian_ellipsoid_2D(mu[k], C, num_sigmas, num_pts, **kwargs) def plot3D(self, feat_idx=[0, 1], num_sigmas=2, num_pts=100, **kwargs): - mu=self.mu[:,feat_idx] - C=np.diag(1/self.Lambda[feat_idx]) + mu = self.mu[:, feat_idx] + C = np.diag(1 / self.Lambda[feat_idx]) for k in range(mu.shape[0]): plot_gaussian_3D(mu[k], C, num_sigmas, num_pts, **kwargs) - - def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, - **kwargs): - mu=self.mu[:,feat_idx] - C=np.diag(1/self.Lambda[feat_idx]) + def plot3D_ellipsoid(self, feat_idx=[0, 1, 2], num_sigmas=2, num_pts=100, **kwargs): + mu = self.mu[:, feat_idx] + C = np.diag(1 / self.Lambda[feat_idx]) for k in range(mu.shape[0]): - plot_gaussian_ellipsoid_3D(mu[k], C, num_sigmas, num_pts, - **kwargs) - + plot_gaussian_ellipsoid_3D(mu[k], C, num_sigmas, num_pts, **kwargs) DiagGMMTiedCov = GMMTiedDiagCov diff --git a/hyperion/pdfs/plda/__init__.py b/hyperion/pdfs/plda/__init__.py index 017e419c..9d11ad38 100644 --- a/hyperion/pdfs/plda/__init__.py +++ b/hyperion/pdfs/plda/__init__.py @@ -8,6 +8,3 
@@ from .frplda import FRPLDA from .splda import SPLDA from .plda import PLDA - - - diff --git a/hyperion/pdfs/plda/frplda.py b/hyperion/pdfs/plda/frplda.py index 4ba309ca..5ea628fe 100644 --- a/hyperion/pdfs/plda/frplda.py +++ b/hyperion/pdfs/plda/frplda.py @@ -11,11 +11,18 @@ from .plda_base import PLDABase - class FRPLDA(PLDABase): - - def __init__(self, mu=None, B=None, W=None, fullcov_W=True, - update_mu=True, update_B=True, update_W=True, **kwargs): + def __init__( + self, + mu=None, + B=None, + W=None, + fullcov_W=True, + update_mu=True, + update_B=True, + update_W=True, + **kwargs + ): super(FRPLDA, self).__init__(mu=mu, update_mu=update_mu, **kwargs) if mu is not None: self.y_dim = mu.shape[0] @@ -25,27 +32,21 @@ def __init__(self, mu=None, B=None, W=None, fullcov_W=True, self.update_B = update_B self.update_W = update_W - - def validate(self): assert self.mu.shape[0] == self.B.shape[0] assert self.mu.shape[0] == self.B.shape[1] assert self.mu.shape[0] == self.W.shape[0] assert self.mu.shape[0] == self.W.shape[1] - @property def is_init(self): if self._is_init: return True - if (self.mu is not None and self.B is not None and - self.W is not None): + if self.mu is not None and self.B is not None and self.W is not None: self.validate() self._is_init = True return self._is_init - - def initialize(self, D): N, F, S = D self.x_dim = F.shape[1] @@ -53,17 +54,17 @@ def initialize(self, D): M = F.shape[0] N_tot = np.sum(N) - y = F/N[:,None] + y = F / N[:, None] Fy = np.dot(F.T, y) C = S - Fy - Fy.T for i in range(M): - yy = np.outer(y[i,:], y[i,:]) + yy = np.outer(y[i, :], y[i, :]) C += N[i] * yy - C = (C+C.T)/2 + C = (C + C.T) / 2 mu = np.mean(y, axis=0) - iB = np.dot(y.T, y)/M - np.outer(mu, mu) - iW = C/N_tot + iB = np.dot(y.T, y) / M - np.outer(mu, mu) + iW = C / N_tot B = invert_pdmat(iB, return_inv=True)[-1] W = invert_pdmat(iW, return_inv=True)[-1] @@ -73,18 +74,17 @@ def initialize(self, D): self.W = W self._is_init = True + def compute_py_g_x( + self, D, return_cov=False, return_logpy_0=False, return_acc=False + ): - - def compute_py_g_x(self, D, return_cov=False, return_logpy_0=False, - return_acc=False): - assert self.is_init - + N, F, S = D - M=F.shape[0] + M = F.shape[0] y_dim = self.y_dim - assert(y_dim == F.shape[1]) + assert y_dim == F.shape[1] compute_inv = return_cov or return_acc return_tuple = compute_inv or return_logpy_0 @@ -92,7 +92,7 @@ def compute_py_g_x(self, D, return_cov=False, return_logpy_0=False, N_is_int = False if np.all(np.ceil(N) == N): N_is_int = True - + gamma = np.dot(F, self.W) + np.dot(self.mu, self.B) if N_is_int: iterator = np.unique(N) @@ -104,9 +104,9 @@ def compute_py_g_x(self, D, return_cov=False, return_logpy_0=False, Sigma_y = np.zeros((M, y_dim, y_dim), dtype=float_cpu()) else: Sigma_y = None - + if return_logpy_0: - logpy = - 0.5*y_dim*np.log(2*np.pi) * np.ones((M,), dtype=float_cpu()) + logpy = -0.5 * y_dim * np.log(2 * np.pi) * np.ones((M,), dtype=float_cpu()) if return_acc: Py = np.zeros((y_dim, y_dim), dtype=float_cpu()) @@ -121,32 +121,35 @@ def compute_py_g_x(self, D, return_cov=False, return_logpy_0=False, i = k N_i = N[k] M_i = 1 - - L_i = self.B + N_i*self.W - - r = invert_pdmat(L_i, right_inv=True, - return_logdet=return_logpy_0, - return_inv=compute_inv) + + L_i = self.B + N_i * self.W + + r = invert_pdmat( + L_i, + right_inv=True, + return_logdet=return_logpy_0, + return_inv=compute_inv, + ) mult_iL = r[0] if return_logpy_0: logL = r[2] if compute_inv: iL = r[-1] - - y[i,:]=mult_iL(gamma[i,:]) - + + y[i, :] = mult_iL(gamma[i, 
:]) + if return_cov: - Sigma_y[i,:,:]=iL + Sigma_y[i, :, :] = iL if return_logpy_0: - logpy[i] += 0.5*(logL - np.sum(y[i,:]*gamma[i,:], axis=-1)) - + logpy[i] += 0.5 * (logL - np.sum(y[i, :] * gamma[i, :], axis=-1)) + if return_acc: - Py += M_i*iL + Py += M_i * iL if not return_tuple: return y - + r = [y] if return_cov: r += [Sigma_y] @@ -156,41 +159,46 @@ def compute_py_g_x(self, D, return_cov=False, return_logpy_0=False, r += [Ry, Py] return r - - def Estep(self, D): N, F, S = D - y, logpy, Ry, Py = self.compute_py_g_x( - D, return_logpy_0=True, return_acc=True) + y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True) - M=F.shape[0] - N_tot=np.sum(N) + M = F.shape[0] + N_tot = np.sum(N) y_acc = np.sum(y, axis=0) Cy = np.dot(F.T, y) - - Niy = y * N[:,None] + + Niy = y * N[:, None] Ry += np.dot(Niy.T, y) Py += np.dot(y.T, y) logpy_acc = np.sum(logpy) - + stats = (N_tot, M, S, logpy_acc, y_acc, Ry, Cy, Py) return stats - - def elbo(self, stats): - N, M, S, logpy_x = stats[:4] + N, M, S, logpy_x = stats[:4] logW = logdet_pdmat(self.W) logB = logdet_pdmat(self.B) - logpx_y = 0.5*(- N*self.x_dim*np.log(2*np.pi) + N*logW - - np.inner(self.W.ravel(), S.ravel())) - logpy = 0.5*M*(- self.y_dim*np.log(2*np.pi) + logB - -np.inner(np.dot(self.mu, self.B), self.mu)) - + logpx_y = 0.5 * ( + -N * self.x_dim * np.log(2 * np.pi) + + N * logW + - np.inner(self.W.ravel(), S.ravel()) + ) + logpy = ( + 0.5 + * M + * ( + -self.y_dim * np.log(2 * np.pi) + + logB + - np.inner(np.dot(self.mu, self.B), self.mu) + ) + ) + elbo = logpx_y + logpy - logpy_x return elbo # N, M, sumy, yy, _, _, CW, logL = stats @@ -199,170 +207,170 @@ def elbo(self, stats): # logW = logdet_pdmat(self.W) # logB = logdet_pdmat(self.B) - + # elbo = 0.5*(-logL - N*self.x_dim*np.log(2*np.pi) # +N*logW - np.inner(self.W.ravel(), CW.ravel()) # +M*logB - np.inner(self.B.ravel(), CB.ravel())) # return elbo - - def MstepML(self, stats): N, M, S, _, y_acc, Ry, Cy, Py = stats - ybar = y_acc/M + ybar = y_acc / M if self.update_mu: self.mu = ybar if self.update_B: if self.update_mu: - iB = Py/M - np.outer(self.mu, self.mu) + iB = Py / M - np.outer(self.mu, self.mu) else: muybar = np.outer(self.mu, ybar) - iB = Py/M - muybar - muybar + np.outer(self.mu, self.mu) + iB = Py / M - muybar - muybar + np.outer(self.mu, self.mu) self.B = invert_pdmat(iB, return_inv=True)[-1] if self.update_W: - iW = (S - Cy - Cy.T + Ry)/N + iW = (S - Cy - Cy.T + Ry) / N if self.fullcov_W: self.W = invert_pdmat(iW, return_inv=True)[-1] else: - self.W=np.diag(1/np.diag(iW)) + self.W = np.diag(1 / np.diag(iW)) - - def MstepMD(self, stats): pass - - def get_config(self): - config = { 'update_W': self.update_W, - 'update_B': self.update_B, - 'fullcov_W': self.fullcov_W} + config = { + "update_W": self.update_W, + "update_B": self.update_B, + "fullcov_W": self.fullcov_W, + } base_config = super(FRPLDA, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = { 'mu': self.mu, - 'B': self.B, - 'W': self.W} + params = {"mu": self.mu, "B": self.B, "W": self.W} self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'B', 'W'] - params = cls._load_params_to_dict(f, config['name'], param_list) + param_list = ["mu", "B", "W"] + params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) - - def llr_1vs1(self, x1, x2): assert self.is_init - + Lnon = self.B + 
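The loop above computes, for each class with zeroth/first-order stats (N_i, F_i), the Gaussian posterior of the latent mean y_i, with precision L_i = B + N_i*W and mean L_i^-1 (W F_i + B mu). A dense reference version using numpy.linalg.solve (illustrative only; it skips the shared-precision batching done above):

import numpy as np

def posterior_y(N, F, mu, B, W):
    # N: (M,), F: (M, d), mu: (d,), B and W: (d, d) precision matrices
    y = np.zeros_like(F)
    for i in range(F.shape[0]):
        L_i = B + N[i] * W
        y[i] = np.linalg.solve(L_i, W @ F[i] + B @ mu)
    return y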
self.W mult_icholLnon, logcholLnon = invert_trimat( sla.cholesky(Lnon, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logLnon = 2*logcholLnon + right_inv=True, + return_logdet=True, + )[:2] + logLnon = 2 * logcholLnon - Ltar = self.B + 2*self.W + Ltar = self.B + 2 * self.W mult_icholLtar, logcholLtar = invert_trimat( sla.cholesky(Ltar, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logLtar = 2*logcholLtar + right_inv=True, + return_logdet=True, + )[:2] + logLtar = 2 * logcholLtar WF1 = np.dot(x1, self.W) WF2 = np.dot(x2, self.W) Bmu = np.dot(self.mu, self.B) - gamma_non_1 = mult_icholLnon(WF1+Bmu) - gamma_non_2 = mult_icholLnon(WF2+Bmu) + gamma_non_1 = mult_icholLnon(WF1 + Bmu) + gamma_non_2 = mult_icholLnon(WF2 + Bmu) - Qnon_1 = np.sum(gamma_non_1*gamma_non_1, axis=1)[:, None] - Qnon_2 = np.sum(gamma_non_2*gamma_non_2, axis=1) + Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] + Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) - gamma_tar_1 = mult_icholLtar(WF1+0.5*Bmu) - gamma_tar_2 = mult_icholLtar(WF2+0.5*Bmu) + gamma_tar_1 = mult_icholLtar(WF1 + 0.5 * Bmu) + gamma_tar_2 = mult_icholLtar(WF2 + 0.5 * Bmu) - Qtar_1 = np.sum(gamma_tar_1*gamma_tar_1, axis=1)[:, None] - Qtar_2 = np.sum(gamma_tar_2*gamma_tar_2, axis=1) + Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] + Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) - scores = 2*np.dot(gamma_tar_1, gamma_tar_2.T) - scores += (Qtar_1-Qnon_1+Qtar_2-Qnon_2) - scores += (2*logLnon-logLtar - -logdet_pdmat(self.B) - +np.inner(np.dot(self.mu, self.B), self.mu)) + scores = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) + scores += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 + scores += ( + 2 * logLnon + - logLtar + - logdet_pdmat(self.B) + + np.inner(np.dot(self.mu, self.B), self.mu) + ) scores *= 0.5 return scores - - def llr_NvsM_book(self, D1, D2): assert self.is_init - + N1, F1, _ = D1 N2, F2, _ = D2 - + Bmu = np.dot(self.mu, self.B) - + scores = np.zeros((len(N1), len(N2)), dtype=float_cpu()) for N1_i in np.unique(N1): for N2_j in np.unique(N2): i = np.where(N1 == N1_i)[0] j = np.where(N2 == N2_j)[0] - L1 = self.B + N1_i*self.W + L1 = self.B + N1_i * self.W mult_icholL1, logcholL1 = invert_trimat( sla.cholesky(L1, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logL1 = 2*logcholL1 + right_inv=True, + return_logdet=True, + )[:2] + logL1 = 2 * logcholL1 - L2 = self.B + N2_j*self.W + L2 = self.B + N2_j * self.W mult_icholL2, logcholL2 = invert_trimat( sla.cholesky(L2, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logL2 = 2*logcholL2 + right_inv=True, + return_logdet=True, + )[:2] + logL2 = 2 * logcholL2 - Ltar = self.B + (N1_i + N2_j)*self.W + Ltar = self.B + (N1_i + N2_j) * self.W mult_icholLtar, logcholLtar = invert_trimat( sla.cholesky(Ltar, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logLtar = 2*logcholLtar - - WF1 = np.dot(F1[i,:], self.W) - WF2 = np.dot(F2[j,:], self.W) - + right_inv=True, + return_logdet=True, + )[:2] + logLtar = 2 * logcholLtar + + WF1 = np.dot(F1[i, :], self.W) + WF2 = np.dot(F2[j, :], self.W) + gamma_non_1 = mult_icholL1(WF1 + Bmu) gamma_non_2 = mult_icholL2(WF2 + Bmu) - - Qnon_1 = np.sum(gamma_non_1*gamma_non_1, axis=1)[:, None] - Qnon_2 = np.sum(gamma_non_2*gamma_non_2, axis=1) - - gamma_tar_1 = mult_icholLtar(WF1 + 0.5*Bmu) - gamma_tar_2 = mult_icholLtar(WF2 + 0.5*Bmu) - - Qtar_1 = np.sum(gamma_tar_1*gamma_tar_1, axis=1)[:, None] - Qtar_2 = 
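As a reference for llr_1vs1 above: in covariance form (Sb = B^-1, Sw = W^-1) the same target/non-target log-likelihood ratio can be computed by brute force from the joint Gaussians of the trial pair. A SciPy sketch, useful only as a numeric cross-check and not as a production scoring path:

import numpy as np
from scipy.stats import multivariate_normal

def llr_1vs1_bruteforce(x1, x2, mu, Sb, Sw):
    # x1, x2: (d,) enrollment/test vectors; Sb, Sw: between/within covariances
    d = mu.shape[0]
    m = np.concatenate([mu, mu])
    C_tar = np.block([[Sb + Sw, Sb], [Sb, Sb + Sw]])        # shared latent y
    C_non = np.block([[Sb + Sw, np.zeros((d, d))],
                      [np.zeros((d, d)), Sb + Sw]])         # independent y's
    xx = np.concatenate([x1, x2])
    return (multivariate_normal.logpdf(xx, m, C_tar)
            - multivariate_normal.logpdf(xx, m, C_non))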
np.sum(gamma_tar_2*gamma_tar_2, axis=1) - - scores_ij = 2*np.dot(gamma_tar_1, gamma_tar_2.T) - scores_ij += (Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2) - scores_ij += (logL1 + logL2 - logLtar) - scores[np.ix_(i,j)] = scores_ij - - scores += (-logdet_pdmat(self.B) + np.inner(np.dot(self.mu, self.B), self.mu)) + + Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] + Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) + + gamma_tar_1 = mult_icholLtar(WF1 + 0.5 * Bmu) + gamma_tar_2 = mult_icholLtar(WF2 + 0.5 * Bmu) + + Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] + Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) + + scores_ij = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) + scores_ij += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 + scores_ij += logL1 + logL2 - logLtar + scores[np.ix_(i, j)] = scores_ij + + scores += -logdet_pdmat(self.B) + np.inner(np.dot(self.mu, self.B), self.mu) scores *= 0.5 return scores - - def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024, return_y=False): - + def sample( + self, num_classes, num_samples_per_class, rng=None, seed=1024, return_y=False + ): + assert self.is_init - + if rng is None: rng = np.random.RandomState(seed=seed) @@ -372,35 +380,31 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024, return chol_Sw = sla.cholesky(Sw, lower=False) x_dim = self.mu.shape[0] - z = rng.normal(size=(num_classes*num_samples_per_class, x_dim)).astype( - dtype=float_cpu(), copy=False) + z = rng.normal(size=(num_classes * num_samples_per_class, x_dim)).astype( + dtype=float_cpu(), copy=False + ) z = np.dot(z, chol_Sw) - y = rng.normal(size=(num_classes, x_dim)).astype( - dtype=float_cpu(), copy=False) + y = rng.normal(size=(num_classes, x_dim)).astype(dtype=float_cpu(), copy=False) y = np.dot(y, chol_Sb) + self.mu y = np.repeat(y, num_samples_per_class, axis=0) if return_y: return y + z, y - + return y + z - - + def weighted_avg_params(self, mu, B, W, w_mu, w_B, w_W): super(FRPLDA, self).weigthed_avg_params(mu, w_mu) if w_B > 0: Sb0 = invert_pdmat(self.B, return_inv=True)[-1] Sb = invert_pdmat(B, return_inv=True)[-1] - Sb = w_B*Sb + (1-w_B)*Sb0 + Sb = w_B * Sb + (1 - w_B) * Sb0 self.B = invert_pdmat(Sb, return_inv=True)[-1] if w_W > 0: Sw0 = invert_pdmat(self.W, return_inv=True)[-1] Sw = invert_pdmat(W, return_inv=True)[-1] - Sw = w_W*Sw + (1-w_W)*Sw0 + Sw = w_W * Sw + (1 - w_W) * Sw0 self.W = invert_pdmat(Sw, return_inv=True)[-1] - def weighted_avg_model(self, plda, w_mu, w_B, w_W): self.weighted_avg_params(plda.mu, plda.B, plda.W, w_mu, w_B, w_W) - - diff --git a/hyperion/pdfs/plda/plda.py b/hyperion/pdfs/plda/plda.py index 83d8e61f..16dee5ea 100644 --- a/hyperion/pdfs/plda/plda.py +++ b/hyperion/pdfs/plda/plda.py @@ -12,9 +12,21 @@ class PLDA(PLDABase): - - def __init__(self, y_dim=None, z_dim=None, mu=None, V=None, U=None, D=None, floor_iD=1e-5, - update_mu=True, update_V=True, update_U=True, update_D=True, **kwargs): + def __init__( + self, + y_dim=None, + z_dim=None, + mu=None, + V=None, + U=None, + D=None, + floor_iD=1e-5, + update_mu=True, + update_V=True, + update_U=True, + update_D=True, + **kwargs + ): super(PLDA, self).__init__(y_dim=y_dim, mu=mu, update_mu=update_mu, **kwargs) self.z_dim = z_dim if V is not None: @@ -38,8 +50,6 @@ def __init__(self, y_dim=None, z_dim=None, mu=None, V=None, U=None, D=None, floo self._W = None self._VW = None self._VWV = None - - def validate(self): assert self.mu.shape[0] >= self.V.shape[0] @@ -48,77 +58,75 @@ def validate(self): assert self.mu.shape[0] == self.U.shape[1] assert 
self.mu.shape[0] == self.D.shape[0] - - @property def is_init(self): if self._is_init: return True - if (self.mu is not None and self.V is not None and - self.U is not None and self.D is not None): + if ( + self.mu is not None + and self.V is not None + and self.U is not None + and self.D is not None + ): self.validate() if self._VWV is None: self.compute_aux() self._is_init = True return self._is_init - - def compute_aux(self): - DV = self.V*self.D - DU = self.U*self.D + DV = self.V * self.D + DU = self.U * self.D self._DU = DU self._J = np.dot(self.V, DU.T) self._Lz = np.eye(self.z_dim, dtype=float_cpu()) + np.dot(DU, self.U.T) self._mult_iLz, _, self._log_Lz = invert_pdmat( - self._Lz, right_inv=True, return_logdet=True) + self._Lz, right_inv=True, return_logdet=True + ) DUiLz = self._mult_iLz(DU.T) self._W = np.diag(self.D) - np.dot(DUiLz, DU) self._VW = DV.T - np.dot(DUiLz, self._J.T) self._VWV = np.dot(self.V, self._VW) - - def initialize(self, D): N, F, S = D self.x_dim = F.shape[1] M = F.shape[0] N_tot = np.sum(N) - Vytilde = F/N[:,None] + Vytilde = F / N[:, None] mu = np.mean(Vytilde, axis=0) Vy = Vytilde - mu U, s, Vt = sla.svd(Vy, full_matrices=False, overwrite_a=True) - V = s[:self.y_dim,None]*Vt[:self.y_dim,:] - - NVytilde = N[:, None]*Vytilde - C = (S - np.dot(NVytilde.T, Vytilde))/N_tot + V = s[: self.y_dim, None] * Vt[: self.y_dim, :] + + NVytilde = N[:, None] * Vytilde + C = (S - np.dot(NVytilde.T, Vytilde)) / N_tot w, U = sla.eigh(C) - U = np.fliplr(U*np.sqrt(w))[:,:self.z_dim].T + U = np.fliplr(U * np.sqrt(w))[:, : self.z_dim].T iD = np.diag(C - np.dot(U.T, U)).copy() - iD[iD 0: Sb0 = np.dot(self.V.T, self.V) Sb = np.dot(V.T, V) - Sb = w_B*Sb + (1-w_B)*Sb0 + Sb = w_B * Sb + (1 - w_B) * Sb0 w, V = sla.eigh(Sb, overwrite_a=True) - V = np.sqrt(w)*V - V = V[:,-self.y_dim:] + V = np.sqrt(w) * V + V = V[:, -self.y_dim :] self.V = V.T if w_W > 0: - Sw0 = np.dot(self.U.T, self.U) + np.diag(1/self.D) - Sw = np.dot(U.T, U) + np.diag(1/D) - Sw = w_W*Sw + (1-w_W)*Sw0 + Sw0 = np.dot(self.U.T, self.U) + np.diag(1 / self.D) + Sw = np.dot(U.T, U) + np.diag(1 / D) + Sw = w_W * Sw + (1 - w_W) * Sw0 w, U = sla.eigh(Sw, overwrite_a=False) - U = np.sqrt(w)*U - U = U[:,-self.z_dim:] + U = np.sqrt(w) * U + U = U[:, -self.z_dim :] self.U = U.T iD = np.diag(Sw - np.dot(self.U.T, self.U)).copy() # print(Sw[:10,:10]) # print(np.dot(self.U.T, self.U)) # print(iD[:10]) - iD[iD 0: # Sw0 = np.dot(self.U.T, self.U) @@ -582,7 +603,5 @@ def weighted_avg_params(self, mu, V, U, D, w_mu, w_B, w_W): # Sd = w_D*Sd + (1-w_D)*Sd0 # self.D = 1/Sd - def weighted_avg_model(self, plda, w_mu, w_B, w_W): self.weighted_avg_params(plda.mu, plda.V, plda.U, plda.D, w_mu, w_B, w_W) - diff --git a/hyperion/pdfs/plda/splda.py b/hyperion/pdfs/plda/splda.py index 8cc9b9da..1ffaaa1c 100644 --- a/hyperion/pdfs/plda/splda.py +++ b/hyperion/pdfs/plda/splda.py @@ -11,9 +11,18 @@ class SPLDA(PLDABase): - - def __init__(self, y_dim=None, mu=None, V=None, W=None, fullcov_W=True, - update_mu=True, update_V=True, update_W=True, **kwargs): + def __init__( + self, + y_dim=None, + mu=None, + V=None, + W=None, + fullcov_W=True, + update_mu=True, + update_V=True, + update_W=True, + **kwargs + ): super().__init__(y_dim=y_dim, mu=mu, update_mu=update_mu, **kwargs) if V is not None: self.y_dim = V.shape[0] @@ -23,57 +32,50 @@ def __init__(self, y_dim=None, mu=None, V=None, W=None, fullcov_W=True, self.update_V = update_V self.update_W = update_W - - def validate(self): assert self.mu.shape[0] >= self.V.shape[0] assert self.mu.shape[0] == 
self.V.shape[1] assert self.mu.shape[0] == self.W.shape[0] assert self.mu.shape[0] == self.W.shape[1] - - @property def is_init(self): if self._is_init: return True - if (self.mu is not None and self.V is not None and - self.W is not None): + if self.mu is not None and self.V is not None and self.W is not None: self.validate() self._is_init = True return self._is_init - - def initialize(self, D): N, F, S = D self.x_dim = F.shape[1] M = F.shape[0] N_tot = np.sum(N) - Vytilde = F/N[:,None] + Vytilde = F / N[:, None] mu = np.mean(Vytilde, axis=0) Vy = Vytilde - mu U, s, Vt = sla.svd(Vy, full_matrices=False, overwrite_a=True) - V = s[:self.y_dim,None]*Vt[:self.y_dim,:] - NVytilde = N[:, None]*Vytilde - C = (S - np.dot(NVytilde.T, Vytilde))/N_tot + V = s[: self.y_dim, None] * Vt[: self.y_dim, :] + NVytilde = N[:, None] * Vytilde + C = (S - np.dot(NVytilde.T, Vytilde)) / N_tot if self.fullcov_W: W = invert_pdmat(C, return_inv=True)[-1] else: - W = 1/np.diag(C) - + W = 1 / np.diag(C) + self.mu = mu self.V = V self.W = W - - def compute_py_g_x(self, D, return_cov=False, return_logpy_0=False, - return_acc=False): + def compute_py_g_x( + self, D, return_cov=False, return_logpy_0=False, return_acc=False + ): N, F, S = D Fc = F - self.mu - + M = F.shape[0] y_dim = self.y_dim @@ -99,9 +101,9 @@ def compute_py_g_x(self, D, return_cov=False, return_logpy_0=False, Sigma_y = np.zeros((M, y_dim, y_dim), dtype=float_cpu()) else: Sigma_y = None - + if return_logpy_0: - logpy = - 0.5*y_dim*np.log(2*np.pi) * np.ones((M,), dtype=float_cpu()) + logpy = -0.5 * y_dim * np.log(2 * np.pi) * np.ones((M,), dtype=float_cpu()) if return_acc: Py = np.zeros((y_dim, y_dim), dtype=float_cpu()) @@ -116,33 +118,36 @@ def compute_py_g_x(self, D, return_cov=False, return_logpy_0=False, i = k N_i = N[k] M_i = 1 - - L_i = I + N_i*VV - r = invert_pdmat(L_i, right_inv=True, - return_logdet=return_logpy_0, - return_inv=compute_inv) + + L_i = I + N_i * VV + r = invert_pdmat( + L_i, + right_inv=True, + return_logdet=return_logpy_0, + return_inv=compute_inv, + ) mult_iL = r[0] if return_logpy_0: logL = r[2] if compute_inv: iL = r[-1] - - y[i,:]=mult_iL(gamma[i,:]) - + + y[i, :] = mult_iL(gamma[i, :]) + if return_cov: - Sigma_y[i,:,:]=iL + Sigma_y[i, :, :] = iL if return_logpy_0: - logpy[i] += 0.5*(logL - np.sum(y[i,:]*gamma[i,:], axis=-1)) - + logpy[i] += 0.5 * (logL - np.sum(y[i, :] * gamma[i, :], axis=-1)) + if return_acc: - Py += M_i*iL - Ry += N_i*M_i*iL + Py += M_i * iL + Ry += N_i * M_i * iL if not return_tuple: return y - + r = [y] if return_cov: r += [Sigma_y] @@ -152,56 +157,55 @@ def compute_py_g_x(self, D, return_cov=False, return_logpy_0=False, r += [Ry, Py] return tuple(r) - def Estep(self, D): N, F, S = D - y, logpy, Ry, Py = self.compute_py_g_x( - D, return_logpy_0=True, return_acc=True) + y, logpy, Ry, Py = self.compute_py_g_x(D, return_logpy_0=True, return_acc=True) - M=F.shape[0] - N_tot=np.sum(N) + M = F.shape[0] + N_tot = np.sum(N) F_tot = np.sum(F, axis=0) y_acc = np.sum(y, axis=0) Cy = np.dot(F.T, y) - - Niy = y * N[:,None] + + Niy = y * N[:, None] Ry1 = np.sum(Niy, axis=0) Ry += np.dot(Niy.T, y) Py += np.dot(y.T, y) logpy_acc = np.sum(logpy) - + stats = (N_tot, M, F_tot, S, logpy_acc, y_acc, Ry1, Ry, Cy, Py) return stats - def elbo(self, stats): - N, M, F, S, logpy_x = stats[:5] + N, M, F, S, logpy_x = stats[:5] logW = logdet_pdmat(self.W) Fmu = np.outer(F, self.mu) - Shat = S - Fmu - Fmu.T + N*np.outer(self.mu, self.mu) + Shat = S - Fmu - Fmu.T + N * np.outer(self.mu, self.mu) + + logpx_y = 0.5 * ( + -N * 
self.x_dim * np.log(2 * np.pi) + + N * logW + - np.inner(self.W.ravel(), Shat.ravel()) + ) + logpy = -0.5 * M * self.y_dim * np.log(2 * np.pi) - logpx_y = 0.5*(- N*self.x_dim*np.log(2*np.pi) + N*logW - - np.inner(self.W.ravel(), Shat.ravel())) - logpy = -0.5*M*self.y_dim*np.log(2*np.pi) - elbo = logpx_y + logpy - logpy_x return elbo - def MstepML(self, stats): N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py = stats a = np.hstack((Ry, Ry1[:, None])) b = np.hstack((Ry1, N)) - Rytilde = np.vstack((a,b)) + Rytilde = np.vstack((a, b)) Cytilde = np.hstack((Cy, F[:, None])) - + if self.update_mu and not self.update_V: - self.mu = (F - np.dot(Ry1, self.V))/N + self.mu = (F - np.dot(Ry1, self.V)) / N if not self.update_mu and self.update_V: iRy_mult = invert_pdmat(Ry, right_inv=False)[0] @@ -210,110 +214,107 @@ def MstepML(self, stats): if self.update_mu and self.update_V: iRytilde_mult = invert_pdmat(Rytilde, right_inv=False)[0] Vtilde = iRytilde_mult(Cytilde.T) - self.V = Vtilde[:-1,:] - self.mu = Vtilde[-1,:] + self.V = Vtilde[:-1, :] + self.mu = Vtilde[-1, :] if self.update_W: if self.update_mu and self.update_V: - iW = (S - np.dot(Cy, self.V) - np.outer(F, self.mu))/N + iW = (S - np.dot(Cy, self.V) - np.outer(F, self.mu)) / N else: Vtilde = np.vstack((self.V, self.mu)) CVt = np.dot(Cytilde, Vtilde) - iW = (S - CVt - CVt.T + np.dot( - np.dot(Vtilde.T, Rytilde), Vtilde))/N + iW = (S - CVt - CVt.T + np.dot(np.dot(Vtilde.T, Rytilde), Vtilde)) / N if self.fullcov_W: self.W = invert_pdmat(iW, return_inv=True)[-1] else: - self.W = np.diag(1/np.diag(iW)) + self.W = np.diag(1 / np.diag(iW)) - def MstepMD(self, stats): N, M, F, S, _, y_acc, Ry1, Ry, Cy, Py = stats - mu_y = y_acc/M - + mu_y = y_acc / M + if self.update_mu: self.mu += np.dot(mu_y, self.V) if self.update_V: - Cov_y = Py/M - np.outer(mu_y, mu_y) + Cov_y = Py / M - np.outer(mu_y, mu_y) chol_Cov_y = sla.cholesky(Cov_y, lower=False, overwrite_a=True) self.V = np.dot(chol_Cov_y, self.V) - def get_config(self): - config = {'update_W': self.update_W, - 'update_V': self.update_V, - 'fullcov_W': self.fullcov_W} + config = { + "update_W": self.update_W, + "update_V": self.update_V, + "fullcov_W": self.fullcov_W, + } base_config = super(SPLDA, self).get_config() return dict(list(base_config.items()) + list(config.items())) - def save_params(self, f): - params = {'mu': self.mu, - 'V': self.V, - 'W': self.W} + params = {"mu": self.mu, "V": self.V, "W": self.W} self._save_params_from_dict(f, params) - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'V', 'W'] - params = cls._load_params_to_dict(f, config['name'], param_list) + param_list = ["mu", "V", "W"] + params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) - - def log_probx_g_y(self, x, y): logW = logdet_pdmat(self.W) delta = x - self.mu - np.dot(y, self.V) - logp = - x.shape[-1]*np.log(2*np.pi) + logW - np.sum( - np.dot(delta, self.W)*delta, axis=-1) + logp = ( + -x.shape[-1] * np.log(2 * np.pi) + + logW + - np.sum(np.dot(delta, self.W) * delta, axis=-1) + ) logp /= 2 return logp - - def llr_1vs1(self, x1, x2): WV = np.dot(self.W, self.V.T) VV = np.dot(self.V, WV) I = np.eye(self.y_dim, dtype=float_cpu()) - + Lnon = I + VV mult_icholLnon, logcholLnon = invert_trimat( sla.cholesky(Lnon, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logLnon = 2*logcholLnon + right_inv=True, + return_logdet=True, + )[:2] + logLnon = 2 * logcholLnon - Ltar = I + 2*VV + Ltar = I + 2 * VV 
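# The two precision matrices below live in the y_dim latent space:
# Lnon = I + V W V^T is the posterior precision of the speaker factor y given a
# single vector (under the different-speaker hypothesis x1 and x2 are scored
# independently), while Ltar = I + 2 V W V^T pools both vectors under the
# same-speaker hypothesis. The LLR is then evaluated in closed form from their
# Cholesky factors: the log-determinants logLnon/logLtar and the quadratic
# terms Q computed from gamma_non/gamma_tar further down.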
mult_icholLtar, logcholLtar = invert_trimat( sla.cholesky(Ltar, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logLtar = 2*logcholLtar + right_inv=True, + return_logdet=True, + )[:2] + logLtar = 2 * logcholLtar - VWF1 = np.dot(x1-self.mu, WV) - VWF2 = np.dot(x2-self.mu, WV) + VWF1 = np.dot(x1 - self.mu, WV) + VWF2 = np.dot(x2 - self.mu, WV) gamma_non_1 = mult_icholLnon(VWF1) gamma_non_2 = mult_icholLnon(VWF2) - Qnon_1 = np.sum(gamma_non_1*gamma_non_1, axis=1)[:, None] - Qnon_2 = np.sum(gamma_non_2*gamma_non_2, axis=1) + Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] + Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) gamma_tar_1 = mult_icholLtar(VWF1) gamma_tar_2 = mult_icholLtar(VWF2) - Qtar_1 = np.sum(gamma_tar_1*gamma_tar_1, axis=1)[:, None] - Qtar_2 = np.sum(gamma_tar_2*gamma_tar_2, axis=1) + Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] + Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) - scores = 2*np.dot(gamma_tar_1, gamma_tar_2.T) - scores += (Qtar_1-Qnon_1+Qtar_2-Qnon_2) - scores += (2*logLnon-logLtar) + scores = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) + scores += Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2 + scores += 2 * logLnon - logLtar scores *= 0.5 return scores - - + def llr_NvsM_book(self, D1, D2): N1, F1, _ = D1 N2, F2, _ = D2 @@ -322,55 +323,60 @@ def llr_NvsM_book(self, D1, D2): VV = np.dot(self.V, WV) I = np.eye(self.y_dim, dtype=float_cpu()) - F1 -= N1[:, None]*self.mu - F2 -= N2[:, None]*self.mu + F1 -= N1[:, None] * self.mu + F2 -= N2[:, None] * self.mu scores = np.zeros((len(N1), len(N2)), dtype=float_cpu()) for N1_i in np.unique(N1): for N2_j in np.unique(N2): i = np.where(N1 == N1_i)[0] j = np.where(N2 == N2_j)[0] - L1 = I + N1_i*VV + L1 = I + N1_i * VV mult_icholL1, logcholL1 = invert_trimat( sla.cholesky(L1, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logL1 = 2*logcholL1 + right_inv=True, + return_logdet=True, + )[:2] + logL1 = 2 * logcholL1 - L2 = I + N2_j*VV + L2 = I + N2_j * VV mult_icholL2, logcholL2 = invert_trimat( sla.cholesky(L2, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logL2 = 2*logcholL2 + right_inv=True, + return_logdet=True, + )[:2] + logL2 = 2 * logcholL2 - Ltar = I + (N1_i + N2_j)*VV + Ltar = I + (N1_i + N2_j) * VV mult_icholLtar, logcholLtar = invert_trimat( sla.cholesky(Ltar, lower=False, overwrite_a=True), - right_inv=True, return_logdet=True)[:2] - logLtar = 2*logcholLtar - - VWF1 = np.dot(F1[i,:], WV) - VWF2 = np.dot(F2[j,:], WV) - + right_inv=True, + return_logdet=True, + )[:2] + logLtar = 2 * logcholLtar + + VWF1 = np.dot(F1[i, :], WV) + VWF2 = np.dot(F2[j, :], WV) + gamma_non_1 = mult_icholL1(VWF1) gamma_non_2 = mult_icholL2(VWF2) - - Qnon_1 = np.sum(gamma_non_1*gamma_non_1, axis=1)[:, None] - Qnon_2 = np.sum(gamma_non_2*gamma_non_2, axis=1) - + + Qnon_1 = np.sum(gamma_non_1 * gamma_non_1, axis=1)[:, None] + Qnon_2 = np.sum(gamma_non_2 * gamma_non_2, axis=1) + gamma_tar_1 = mult_icholLtar(VWF1) gamma_tar_2 = mult_icholLtar(VWF2) - - Qtar_1 = np.sum(gamma_tar_1*gamma_tar_1, axis=1)[:, None] - Qtar_2 = np.sum(gamma_tar_2*gamma_tar_2, axis=1) - - scores_ij = 2*np.dot(gamma_tar_1, gamma_tar_2.T) - scores_ij += (Qtar_1 - Qnon_1 + Qtar_2 - Qnon_2) - scores_ij += (logL1 + logL2 - logLtar) - scores[np.ix_(i,j)] = scores_ij - + + Qtar_1 = np.sum(gamma_tar_1 * gamma_tar_1, axis=1)[:, None] + Qtar_2 = np.sum(gamma_tar_2 * gamma_tar_2, axis=1) + + scores_ij = 2 * np.dot(gamma_tar_1, gamma_tar_2.T) + scores_ij += Qtar_1 - Qnon_1 + Qtar_2 - 
Qnon_2 + scores_ij += logL1 + logL2 - logLtar + scores[np.ix_(i, j)] = scores_ij + scores *= 0.5 return scores - def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): if rng is None: @@ -380,38 +386,38 @@ def sample(self, num_classes, num_samples_per_class, rng=None, seed=1024): chol_Sw = sla.cholesky(Sw, lower=False) x_dim = self.mu.shape[0] - z = rng.normal(size=(num_classes*num_samples_per_class, x_dim)).astype(dtype=float_cpu(), copy=False) + z = rng.normal(size=(num_classes * num_samples_per_class, x_dim)).astype( + dtype=float_cpu(), copy=False + ) z = np.dot(z, chol_Sw) - y = rng.normal(size=(num_classes, self.y_dim)).astype(dtype=float_cpu(), copy=False) + y = rng.normal(size=(num_classes, self.y_dim)).astype( + dtype=float_cpu(), copy=False + ) y = np.dot(y, self.V) + self.mu y = np.repeat(y, num_samples_per_class, axis=0) return y + z - - def weighted_avg_params(self, mu, V, W, w_mu, w_B, w_W): super(SPLDA, self).weigthed_avg_params(mu, w_mu) if w_B > 0: Sb0 = np.dot(self.V.T, self.V) Sb = np.dot(V.T, V) - Sb = w_B*Sb + (1-w_B)*Sb0 + Sb = w_B * Sb + (1 - w_B) * Sb0 w, V = sla.eigh(Sb, overwrite_a=True) - w = w[-self.y_dim:] - V = np.sqrt(w)*V[:,-self.y_dim:] + w = w[-self.y_dim :] + V = np.sqrt(w) * V[:, -self.y_dim :] self.V = V.T - + if w_W > 0: Sw0 = invert_pdmat(self.W, return_inv=True)[-1] Sw = invert_pdmat(W, return_inv=True)[-1] - Sw = w_W*Sw + (1-w_W)*Sw0 + Sw = w_W * Sw + (1 - w_W) * Sw0 self.W = invert_pdmat(Sw, return_inv=True)[-1] - def weighted_avg_model(self, plda, w_mu, w_B, w_W): self.weighted_avg_params(plda.mu, plda.V, plda.W, w_mu, w_B, w_W) - - + def project(self, T, delta_mu=None): mu = self.mu if mu is not None: @@ -423,4 +429,3 @@ def project(self, T, delta_mu=None): W = invert_pdmat(Sw, return_inv=True)[-1] return SPLDA(mu=mu, V=V, W=W, fullcov_W=True) - diff --git a/hyperion/pipeline/pipeline.py b/hyperion/pipeline/pipeline.py index 4198ffbc..6b8076f5 100644 --- a/hyperion/pipeline/pipeline.py +++ b/hyperion/pipeline/pipeline.py @@ -11,9 +11,10 @@ from ..transforms import * + class Pipeline(HypModel): - """Class to process a series of models. 
- """ + """Class to process a series of models.""" + def __init__(self, transforms, **kwargs): super(Pipeline, self).__init__(**kwargs) if not isinstance(transforms, list): @@ -22,48 +23,41 @@ def __init__(self, transforms, **kwargs): if transforms is not None: self.update_names() - def append(self, t): self.transforms.append(t) if self.name is not None: - t.name = self.name + '/' + t.name + t.name = self.name + "/" + t.name - def predict(self, x): for t in self.transforms: x = t.predict(x) return x - def update_names(self): if self.name is not None: for t in self.transforms: - t.name = self.name + '/' + t.name + t.name = self.name + "/" + t.name - def get_config(self): config = super(Pipeline, self).get_config() config_t = {} for i in range(len(self.transforms)): config_t[i] = self.transforms[i].get_config() - config['transforms'] = config_t + config["transforms"] = config_t return config - def save_params(self, f): for t in self.transforms: t.save_params(f) - @classmethod def load_params(cls, f, config): - config_ts = config['transforms'] + config_ts = config["transforms"] transforms = [] for i in range(len(config_ts)): config_t = config_ts[str(i)] logging.debug(config_t) - class_t = globals()[config_t['class_name']] + class_t = globals()[config_t["class_name"]] t = class_t.load_params(f, config_t) transforms.append(t) - return cls(transforms, name=config['name']) - + return cls(transforms, name=config["name"]) diff --git a/hyperion/score_norm/__init__.py b/hyperion/score_norm/__init__.py index 2d1b5723..b0eb8000 100644 --- a/hyperion/score_norm/__init__.py +++ b/hyperion/score_norm/__init__.py @@ -10,5 +10,3 @@ from .tz_norm import TZNorm from .s_norm import SNorm from .adapt_s_norm import AdaptSNorm - - diff --git a/hyperion/score_norm/adapt_s_norm.py b/hyperion/score_norm/adapt_s_norm.py index 748bbdb6..3f1a47c7 100644 --- a/hyperion/score_norm/adapt_s_norm.py +++ b/hyperion/score_norm/adapt_s_norm.py @@ -11,16 +11,21 @@ class AdaptSNorm(ScoreNorm): - """Class for adaptive S-Norm - """ + """Class for adaptive S-Norm""" + def __init__(self, nbest=100, nbest_discard=0, **kwargs): super(AdaptSNorm, self).__init__(*kwargs) self.nbest = nbest self.nbest_discard = nbest_discard - - - def predict(self, scores, scores_coh_test, scores_enr_coh, mask_coh_test=None, mask_enr_coh=None): + def predict( + self, + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test=None, + mask_enr_coh=None, + ): assert scores_enr_coh.shape[1] == scores_coh_test.shape[0] assert self.nbest_discard < scores_enr_coh.shape[1] @@ -33,50 +38,51 @@ def predict(self, scores, scores_coh_test, scores_enr_coh, mask_coh_test=None, m scores_coh_test[mask_coh_test == False] = 0 if mask_enr_coh is not None: scores_enr_coh[mask_enr_coh == False] = 0 - - best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[self.nbest_discard:self.nbest_discard+nbest] + + best_idx = np.flipud(np.argsort(scores_coh_test, axis=0))[ + self.nbest_discard : self.nbest_discard + nbest + ] scores_z_norm = np.zeros_like(scores) for i in range(scores.shape[1]): - best_idx_i = best_idx[:,i] - - mu_z = np.mean(scores_enr_coh[:, best_idx_i], - axis=1, keepdims=True) + best_idx_i = best_idx[:, i] + + mu_z = np.mean(scores_enr_coh[:, best_idx_i], axis=1, keepdims=True) if mask_enr_coh is None: - s_z = np.std(scores_enr_coh[:, best_idx_i], - axis=1, keepdims=True) + s_z = np.std(scores_enr_coh[:, best_idx_i], axis=1, keepdims=True) else: - norm = np.mean(mask_enr_coh[:, best_idx_i], - axis=1, keepdims=True) + norm = np.mean(mask_enr_coh[:, 
best_idx_i], axis=1, keepdims=True) mu_z /= norm - s_z = np.sqrt(np.mean(scores_enr_coh[:, best_idx_i]**2, - axis=1, keepdims=True)/norm - mu_z**2) + s_z = np.sqrt( + np.mean(scores_enr_coh[:, best_idx_i] ** 2, axis=1, keepdims=True) + / norm + - mu_z ** 2 + ) s_z = np.clip(s_z, a_min=1e-5, a_max=None) - scores_z_norm[:,i] = (scores[:,i] - mu_z.T)/s_z.T - + scores_z_norm[:, i] = (scores[:, i] - mu_z.T) / s_z.T - best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[:,self.nbest_discard:self.nbest_discard+nbest] + best_idx = np.fliplr(np.argsort(scores_enr_coh, axis=1))[ + :, self.nbest_discard : self.nbest_discard + nbest + ] scores_t_norm = np.zeros_like(scores) for i in range(scores.shape[0]): best_idx_i = best_idx[i] - - mu_z = np.mean(scores_coh_test[best_idx_i,:], - axis=0, keepdims=True) + + mu_z = np.mean(scores_coh_test[best_idx_i, :], axis=0, keepdims=True) if mask_coh_test is None: - s_z = np.std(scores_coh_test[best_idx_i,:], - axis=0, keepdims=True) + s_z = np.std(scores_coh_test[best_idx_i, :], axis=0, keepdims=True) else: - norm = np.mean(mask_coh_test[best_idx_i,:], - axis=0, keepdims=True) + norm = np.mean(mask_coh_test[best_idx_i, :], axis=0, keepdims=True) mu_z /= norm - s_z = np.sqrt(np.mean(scores_coh_test[best_idx_i,:]**2, - axis=0, keepdims=True)/norm - mu_z**2) + s_z = np.sqrt( + np.mean(scores_coh_test[best_idx_i, :] ** 2, axis=0, keepdims=True) + / norm + - mu_z ** 2 + ) s_z = np.clip(s_z, a_min=1e-5, a_max=None) - scores_t_norm[i,:] = (scores[i,:] - mu_z)/s_z + scores_t_norm[i, :] = (scores[i, :] - mu_z) / s_z - - return (scores_z_norm + scores_t_norm)/np.sqrt(2) - + return (scores_z_norm + scores_t_norm) / np.sqrt(2) diff --git a/hyperion/score_norm/s_norm.py b/hyperion/score_norm/s_norm.py index 30686c60..ee00a7e8 100644 --- a/hyperion/score_norm/s_norm.py +++ b/hyperion/score_norm/s_norm.py @@ -10,21 +10,25 @@ from .t_norm import TNorm from .z_norm import ZNorm + class SNorm(ScoreNorm): - """ Class for S-Norm, symmetric score normalization. - """ + """Class for S-Norm, symmetric score normalization.""" + def __init__(self, **kwargs): super(SNorm, self).__init__(*kwargs) self.t_norm = TNorm(**kwargs) self.z_norm = ZNorm(**kwargs) - - - def predict(self, scores, scores_coh_test, scores_enr_coh, - mask_coh_test=None, mask_enr_coh=None): + def predict( + self, + scores, + scores_coh_test, + scores_enr_coh, + mask_coh_test=None, + mask_enr_coh=None, + ): scores_z_norm = self.z_norm.predict(scores, scores_enr_coh, mask_enr_coh) scores_t_norm = self.t_norm.predict(scores, scores_coh_test, mask_coh_test) - - return (scores_z_norm + scores_t_norm)/np.sqrt(2) - + + return (scores_z_norm + scores_t_norm) / np.sqrt(2) diff --git a/hyperion/score_norm/score_norm.py b/hyperion/score_norm/score_norm.py index d32a2b0e..f20a0b98 100644 --- a/hyperion/score_norm/score_norm.py +++ b/hyperion/score_norm/score_norm.py @@ -7,12 +7,12 @@ from ..hyp_model import HypModel + class ScoreNorm(HypModel): """ Base class for score normalization """ + def __init__(self, std_floor=1e-5, **kwargs): super(ScoreNorm, self).__init__(*kwargs) self.std_floor = std_floor - - diff --git a/hyperion/score_norm/t_norm.py b/hyperion/score_norm/t_norm.py index 8fae08f0..3fb92548 100644 --- a/hyperion/score_norm/t_norm.py +++ b/hyperion/score_norm/t_norm.py @@ -8,21 +8,24 @@ from .score_norm import ScoreNorm + class TNorm(ScoreNorm): - """Class for T-Norm score normalization. 
- """ + """Class for T-Norm score normalization.""" + def predict(self, scores, scores_coh_test, mask=None): if mask is None: mu_t = np.mean(scores_coh_test, axis=0, keepdims=True) s_t = np.std(scores_coh_test, axis=0, keepdims=True) else: - scores_coh_test[mask==False] = 0 + scores_coh_test[mask == False] = 0 n_t = np.mean(mask, axis=0, keepdims=True) - mu_t = np.mean(scores_coh_test, axis=0, keepdims=True)/n_t - s_t = np.sqrt(np.mean(scores_coh_test**2, axis=0, keepdims=True)/n_t - mu_t**2) - - s_t[s_t0) + # image + l0 = x.shape[2] * x.shape[3] - torch.sum( + torch.sum(d_x < 0.0001, dim=2) > 0 + ) if self.indep_channels: total_change = torch.abs(dy_x) * d_x valid = torch.flatten(valid) valid_all = valid else: - total_change = torch.sum(torch.abs(dy_x, dim=1))*torch.sum(d_x, dim=1) - valid = valid.view(x.shape[1], x.shape[2]*x.shape[3]) + total_change = torch.sum(torch.abs(dy_x, dim=1)) * torch.sum( + d_x, dim=1 + ) + valid = valid.view(x.shape[1], x.shape[2] * x.shape[3]) valid_all = torch.sum(valid, dim=0) > 0 else: raise NotImplementedError() @@ -146,43 +173,52 @@ def _generate_one(self, x, target): l0 = float(l0) for idx in torch.argsort(total_change): if valid_all[idx]: - change_count += 1 + change_count += 1 if valid.dim() == 1: valid[idx] = 0 else: valid[:, idx] = 0 - + # if total_change[idx] > .01 #this is what is hard coded in carlini's code but this makes optim very slow, it just removes one sample at a time, not feasible for speech - if total_change[idx] > 0.5*max_change: - #if change is big we stop putting elements to 0 - logging.info('break because of large total-change ' - '{} > {}'.format( - total_change[idx], 0.5*max_change)) + if total_change[idx] > 0.5 * max_change: + # if change is big we stop putting elements to 0 + logging.info( + "break because of large total-change " + "{} > {}".format(total_change[idx], 0.5 * max_change) + ) break - if change_count >= 0.5*l0: #in carlini's code 0.3*l0**.5 + if change_count >= 0.5 * l0: # in carlini's code 0.3*l0**.5 # if we put to many elements to 0, we stop - logging.info('break because large change-count ' - '{} >= {} l0={}'.format( - change_count, 0.5*float(l0), l0)) + logging.info( + "break because large change-count " + "{} >= {} l0={}".format(change_count, 0.5 * float(l0), l0) + ) break - logging.info('----carlini-wagner-l0--l0-optim it={} x-shape={} ' - 'l0={} c={}' - 'cur-num-valid-changes={} next-num-valid-changes={} ' - 'avg-total-change={} ' - 'max-total-change={} '.format( - cur_it, x.shape, l0, c, - cur_num_valid, cur_num_valid-change_count, - avg_change, max_change)) + logging.info( + "----carlini-wagner-l0--l0-optim it={} x-shape={} " + "l0={} c={}" + "cur-num-valid-changes={} next-num-valid-changes={} " + "avg-total-change={} " + "max-total-change={} ".format( + cur_it, + x.shape, + l0, + c, + cur_num_valid, + cur_num_valid - change_count, + avg_change, + max_change, + ) + ) valid = valid.view_as(x) best_adv = x_adv cur_it += 1 - def generate(self, input, target): - + if self.is_binary is None: # run the model to know weather is binary classification problem or multiclass z = self.model(input) @@ -197,4 +233,3 @@ def generate(self, input, target): x_adv[i] = self._generate_one(input[i], target[i]) return x_adv - diff --git a/hyperion/torch/adv_attacks/carlini_wagner_l2.py b/hyperion/torch/adv_attacks/carlini_wagner_l2.py index 7ef51a22..27cffe97 100644 --- a/hyperion/torch/adv_attacks/carlini_wagner_l2.py +++ b/hyperion/torch/adv_attacks/carlini_wagner_l2.py @@ -12,46 +12,64 @@ from .carlini_wagner import 
CarliniWagner -class CarliniWagnerL2(CarliniWagner): - def __init__(self, model, confidence=0.0, lr=1e-2, - binary_search_steps=9, max_iter=10000, - abort_early=True, initial_c=1e-3, - norm_time=False, time_dim=None, use_snr=False, - targeted=False, range_min=None, range_max=None): +class CarliniWagnerL2(CarliniWagner): + def __init__( + self, + model, + confidence=0.0, + lr=1e-2, + binary_search_steps=9, + max_iter=10000, + abort_early=True, + initial_c=1e-3, + norm_time=False, + time_dim=None, + use_snr=False, + targeted=False, + range_min=None, + range_max=None, + ): super().__init__( - model, confidence=confidence, lr=lr, + model, + confidence=confidence, + lr=lr, max_iter=max_iter, - abort_early=abort_early, initial_c=initial_c, - norm_time=norm_time, time_dim=time_dim, use_snr=use_snr, - targeted=targeted, range_min=range_min, range_max=range_max) + abort_early=abort_early, + initial_c=initial_c, + norm_time=norm_time, + time_dim=time_dim, + use_snr=use_snr, + targeted=targeted, + range_min=range_min, + range_max=range_max, + ) self.binary_search_steps = binary_search_steps self.repeat = binary_search_steps >= 10 - @property def attack_info(self): info = super().attack_info if self.use_snr: - threat = 'snr' + threat = "snr" else: - threat = 'l2' - new_info = {'binary_search_steps': self.binary_search_steps, - 'threat_model': threat, - 'attack_type': 'cw-l2' } + threat = "l2" + new_info = { + "binary_search_steps": self.binary_search_steps, + "threat_model": threat, + "attack_type": "cw-l2", + } info.update(new_info) return info - @staticmethod def _compute_negsnr(x_norm, d_norm): - return 20*(torch.log10(d_norm) - torch.log10(x_norm)) - + return 20 * (torch.log10(d_norm) - torch.log10(x_norm)) def generate(self, input, target): - + if self.is_binary is None: # run the model to know weather is binary classification problem or multiclass z = self.model(input) @@ -61,15 +79,15 @@ def generate(self, input, target): self.is_binary = None del z - norm_dim = tuple([i for i in range(1,input.dim())]) + norm_dim = tuple([i for i in range(1, input.dim())]) if self.use_snr: x_norm = torch.norm(input, dim=norm_dim) - w0 = self.w_x(input).detach() #transform x into tanh space - + w0 = self.w_x(input).detach() # transform x into tanh space + batch_size = input.shape[0] - global_best_norm = 1e10*torch.ones(batch_size, device=input.device) + global_best_norm = 1e10 * torch.ones(batch_size, device=input.device) global_success = torch.zeros(batch_size, dtype=torch.bool, device=input.device) best_adv = input.clone() @@ -79,18 +97,19 @@ def generate(self, input, target): for bs_step in range(self.binary_search_steps): - if self.repeat and bs_step == self.binary_search_steps-1: + if self.repeat and bs_step == self.binary_search_steps - 1: # The last iteration (if we run many steps) repeat the search once. 
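# c weights the misclassification term f against the perturbation-size term.
# The binary search below shrinks c_upper_bound whenever the attack succeeds at
# the current c and raises c_lower_bound when it fails, then moves c to their
# midpoint; while no successful c has been found yet, c is multiplied by 10
# instead. This final repeated step simply re-runs the optimization at
# c_upper_bound.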
c = c_upper_bound - logging.info('---carlini-wagner bin-search-step={}, c={}'.format( - bs_step, c)) - + logging.info( + "---carlini-wagner bin-search-step={}, c={}".format(bs_step, c) + ) + modifier = 1e-3 * torch.randn_like(w0).detach() modifier.requires_grad = True opt = optim.Adam([modifier], lr=self.lr) loss_prev = 1e10 - best_norm = 1e10*torch.ones(batch_size, device=w0.device) + best_norm = 1e10 * torch.ones(batch_size, device=w0.device) success = torch.zeros(batch_size, dtype=torch.bool, device=w0.device) for opt_step in range(self.max_iter): @@ -99,15 +118,19 @@ def generate(self, input, target): x_adv = self.x_w(w) z = self.model(x_adv) f = self.f(z, target) - delta = x_adv-input + delta = x_adv - input d_norm = torch.norm(delta, dim=norm_dim) if self.use_snr: # minimize the negative SNR(dB) d_norm = self._compute_negsnr(x_norm, d_norm) elif self.norm_time: # normalize by number of samples to get rms value - logging.info('rms {} {}'.format(input.shape, math.sqrt(float(input.shape[self.time_dim])))) - d_norm = d_norm/math.sqrt(float(input.shape[self.time_dim])) + logging.info( + "rms {} {}".format( + input.shape, math.sqrt(float(input.shape[self.time_dim])) + ) + ) + d_norm = d_norm / math.sqrt(float(input.shape[self.time_dim])) loss1 = d_norm.mean() loss2 = (c * f).mean() @@ -116,54 +139,72 @@ def generate(self, input, target): loss.backward() opt.step() - #if the attack is successful f(x+delta)==0 - step_success = (f < 0.0001) + # if the attack is successful f(x+delta)==0 + step_success = f < 0.0001 - #find elements that reduced l2 and where successful for current c value + # find elements that reduced l2 and where successful for current c value improv_idx = (d_norm < best_norm) & step_success best_norm[improv_idx] = d_norm[improv_idx] success[improv_idx] = 1 - #find elements that reduced l2 and where successful for global optimization + # find elements that reduced l2 and where successful for global optimization improv_idx = (d_norm < global_best_norm) & step_success global_best_norm[improv_idx] = d_norm[improv_idx] global_success[improv_idx] = 1 best_adv[improv_idx] = x_adv[improv_idx] - if opt_step % (self.max_iter//10) == 0: - logging.info('----carlini-wagner bin-search-step={0:d}, ' - 'opt-step={1:d}/{2:d} ' - 'loss={3:.2f} d_norm={4:.2f} cf={5:.4f} ' - 'num-success={6:d}'.format( - bs_step, opt_step, self.max_iter, - loss.item(), loss1.item(), loss2.item(), - torch.sum(step_success))) - - logging.info('----carlini-wagner bin-search-step={}, ' - 'opt-step={}/{} ' - 'step_success={}, success={} best_norm={} ' - 'global_success={} ' - 'global_best_norm={} d_norm={}'.format( - bs_step, opt_step, self.max_iter, - step_success, success, best_norm, - global_success, global_best_norm, d_norm)) + if opt_step % (self.max_iter // 10) == 0: + logging.info( + "----carlini-wagner bin-search-step={0:d}, " + "opt-step={1:d}/{2:d} " + "loss={3:.2f} d_norm={4:.2f} cf={5:.4f} " + "num-success={6:d}".format( + bs_step, + opt_step, + self.max_iter, + loss.item(), + loss1.item(), + loss2.item(), + torch.sum(step_success), + ) + ) + + logging.info( + "----carlini-wagner bin-search-step={}, " + "opt-step={}/{} " + "step_success={}, success={} best_norm={} " + "global_success={} " + "global_best_norm={} d_norm={}".format( + bs_step, + opt_step, + self.max_iter, + step_success, + success, + best_norm, + global_success, + global_best_norm, + d_norm, + ) + ) loss_it = loss.item() if self.abort_early: - if loss_it > 0.999*loss_prev: - logging.info('----carlini-wagner abort-early ' - 
'bin-search-step={}, opt-step={}/{} ' - 'loss={}, loss_prev={}'.format( - bs_step, opt_step, self.max_iter, - loss_it, loss_prev)) + if loss_it > 0.999 * loss_prev: + logging.info( + "----carlini-wagner abort-early " + "bin-search-step={}, opt-step={}/{} " + "loss={}, loss_prev={}".format( + bs_step, opt_step, self.max_iter, loss_it, loss_prev + ) + ) break loss_prev = loss_it - - #readjust c + + # readjust c c_upper_bound[success] = torch.min(c_upper_bound[success], c[success]) c_lower_bound[~success] = torch.max(c_lower_bound[~success], c[~success]) avg_c_idx = c_upper_bound < 1e9 - c[avg_c_idx] = (c_lower_bound[avg_c_idx] + c_upper_bound[avg_c_idx])/2 + c[avg_c_idx] = (c_lower_bound[avg_c_idx] + c_upper_bound[avg_c_idx]) / 2 cx10_idx = (~success) & (~avg_c_idx) c[cx10_idx] *= 10 diff --git a/hyperion/torch/adv_attacks/carlini_wagner_linf.py b/hyperion/torch/adv_attacks/carlini_wagner_linf.py index 0e7a1aa0..d9ee779c 100644 --- a/hyperion/torch/adv_attacks/carlini_wagner_linf.py +++ b/hyperion/torch/adv_attacks/carlini_wagner_linf.py @@ -11,36 +11,52 @@ from .carlini_wagner import CarliniWagner -class CarliniWagnerLInf(CarliniWagner): - def __init__(self, model, confidence=0.0, lr=1e-2, - max_iter=10000, - abort_early=True, initial_c=1e-3, reduce_c=False, - c_incr_factor=2, tau_decr_factor=0.9, - targeted=False, range_min=None, range_max=None): +class CarliniWagnerLInf(CarliniWagner): + def __init__( + self, + model, + confidence=0.0, + lr=1e-2, + max_iter=10000, + abort_early=True, + initial_c=1e-3, + reduce_c=False, + c_incr_factor=2, + tau_decr_factor=0.9, + targeted=False, + range_min=None, + range_max=None, + ): super().__init__( - model, confidence=confidence, lr=lr, + model, + confidence=confidence, + lr=lr, max_iter=max_iter, - abort_early=abort_early, initial_c=initial_c, - targeted=targeted, range_min=range_min, range_max=range_max) + abort_early=abort_early, + initial_c=initial_c, + targeted=targeted, + range_min=range_min, + range_max=range_max, + ) self.reduce_c = reduce_c self.c_incr_factor = c_incr_factor self.tau_decr_factor = tau_decr_factor - @property def attack_info(self): info = super().attack_info - new_info = {'reduce_c': self.reduce_c, - 'c_incr_factor': self.c_incr_factor, - 'tau_decr_factor': self.tau_decr_factor, - 'threat_model': 'linf', - 'attack_type': 'cw-linf'} + new_info = { + "reduce_c": self.reduce_c, + "c_incr_factor": self.c_incr_factor, + "tau_decr_factor": self.tau_decr_factor, + "threat_model": "linf", + "attack_type": "cw-linf", + } info.update(new_info) return info - def _attack(self, x, target, start_adv, tau, c): w_start = self.w_x(start_adv).detach() @@ -65,19 +81,26 @@ def _attack(self, x, target, start_adv, tau, c): loss.backward() opt.step() - #if the attack is successful f(x+delta)==0 - step_success = (f < 1e-4) - if opt_step % (self.max_iter//10) == 0: - logging.info('--------carlini-wagner-linf--l1-optim ' - 'c_step={0:d} opt-step={1:d} c={2:f} ' - 'loss={3:.2f} d_norm={4:.2f} cf={5:.5f} ' - 'success={6}'.format( - c_step, opt_step, c, - loss.item(), loss1.item()+tau, loss2.item(), - bool(step_success.item()))) - + # if the attack is successful f(x+delta)==0 + step_success = f < 1e-4 + if opt_step % (self.max_iter // 10) == 0: + logging.info( + "--------carlini-wagner-linf--l1-optim " + "c_step={0:d} opt-step={1:d} c={2:f} " + "loss={3:.2f} d_norm={4:.2f} cf={5:.5f} " + "success={6}".format( + c_step, + opt_step, + c, + loss.item(), + loss1.item() + tau, + loss2.item(), + bool(step_success.item()), + ) + ) + loss_it = loss.item() - if 
loss_it <=0 or (step_success and self.abort_early): + if loss_it <= 0 or (step_success and self.abort_early): break if step_success: @@ -87,38 +110,39 @@ def _attack(self, x, target, start_adv, tau, c): c_step += 1 return None - def _generate_one(self, x, target): - + x = x.unsqueeze(dim=0) target = target.unsqueeze(dim=0) best_adv = x c = self.initial_c tau_max = max(abs(self.range_max), abs(self.range_min)) - tau_min = 1./256 + tau_min = 1.0 / 256 tau = tau_max cur_it = 0 while tau > tau_min: res = self._attack(x, target, best_adv, tau, c) if res is None: - logging.info('----carlini-wagner-linf--return it={} x-shape={} ' - 'tau={} c={}'.format( - cur_it, x.shape, tau, c)) + logging.info( + "----carlini-wagner-linf--return it={} x-shape={} " + "tau={} c={}".format(cur_it, x.shape, tau, c) + ) return best_adv[0] x_adv, c = res if self.reduce_c: c /= 2 - actual_tau = torch.max(torch.abs(x-x_adv)) + actual_tau = torch.max(torch.abs(x - x_adv)) if actual_tau < tau: tau = actual_tau - logging.info('----carlini-wagner-lin--tau-optim it={} x-shape={} ' - 'tau={}'.format( - cur_it, x.shape, tau)) + logging.info( + "----carlini-wagner-lin--tau-optim it={} x-shape={} " + "tau={}".format(cur_it, x.shape, tau) + ) best_adv = x_adv tau *= self.tau_decr_factor @@ -126,9 +150,8 @@ def _generate_one(self, x, target): return best_adv[0] - def generate(self, input, target): - + if self.is_binary is None: # run the model to know weather is binary classification problem or multiclass z = self.model(input) @@ -143,4 +166,3 @@ def generate(self, input, target): x_adv[i] = self._generate_one(input[i], target[i]) return x_adv - diff --git a/hyperion/torch/adv_attacks/fgsm_attack.py b/hyperion/torch/adv_attacks/fgsm_attack.py index 970a316f..06065824 100644 --- a/hyperion/torch/adv_attacks/fgsm_attack.py +++ b/hyperion/torch/adv_attacks/fgsm_attack.py @@ -6,24 +6,21 @@ from .adv_attack import AdvAttack -class FGSMAttack(AdvAttack): - def __init__(self, model, eps, loss=None, targeted=False, range_min=None, range_max=None): - super().__init__( - model, loss, targeted, range_min, range_max) +class FGSMAttack(AdvAttack): + def __init__( + self, model, eps, loss=None, targeted=False, range_min=None, range_max=None + ): + super().__init__(model, loss, targeted, range_min, range_max) self.eps = eps - @property def attack_info(self): info = super().attack_info - new_info = {'eps': self.eps, - 'threat_model': 'linf', - 'attack_type': 'fgsm' } + new_info = {"eps": self.eps, "threat_model": "linf", "attack_type": "fgsm"} info.update(new_info) return info - def generate(self, input, target): input.requires_grad = True @@ -32,12 +29,10 @@ def generate(self, input, target): self.model.zero_grad() loss.backward() dL_x = input.grad.data - + f = 1 if self.targeted: f = -1 adv_ex = input + f * self.eps * dL_x.sign() return self._clamp(adv_ex) - - diff --git a/hyperion/torch/adv_attacks/iter_fgsm_attack.py b/hyperion/torch/adv_attacks/iter_fgsm_attack.py index fd1e5857..bf3c7009 100644 --- a/hyperion/torch/adv_attacks/iter_fgsm_attack.py +++ b/hyperion/torch/adv_attacks/iter_fgsm_attack.py @@ -6,29 +6,36 @@ from .adv_attack import AdvAttack -class IterFGSMAttack(AdvAttack): - def __init__(self, model, eps, alpha, loss=None, - targeted=False, range_min=None, range_max=None): - super().__init__( - model, loss, targeted, range_min, range_max) +class IterFGSMAttack(AdvAttack): + def __init__( + self, + model, + eps, + alpha, + loss=None, + targeted=False, + range_min=None, + range_max=None, + ): + super().__init__(model, loss, 
targeted, range_min, range_max) self.eps = eps self.alpha = alpha - self.max_iter = int(1.25*eps/alpha) - + self.max_iter = int(1.25 * eps / alpha) @property def attack_info(self): info = super().attack_info - new_info = {'eps': self.eps, - 'alpha': self.alpha, - 'max_iter': self.max_iter, - 'threat_model': 'linf', - 'attack_type': 'iter-fgsm'} + new_info = { + "eps": self.eps, + "alpha": self.alpha, + "max_iter": self.max_iter, + "threat_model": "linf", + "attack_type": "iter-fgsm", + } info.update(new_info) return info - def generate(self, input, target): f = 1 @@ -45,8 +52,6 @@ def generate(self, input, target): loss.backward() dL_x = x.grad.data x = x + f * self.alpha * dL_x.sign() - x = input + torch.clamp(x-input, -self.eps, self.eps) - - return self._clamp(x) + x = input + torch.clamp(x - input, -self.eps, self.eps) - + return self._clamp(x) diff --git a/hyperion/torch/adv_attacks/pgd_attack.py b/hyperion/torch/adv_attacks/pgd_attack.py index 03b05882..879531ed 100644 --- a/hyperion/torch/adv_attacks/pgd_attack.py +++ b/hyperion/torch/adv_attacks/pgd_attack.py @@ -8,14 +8,25 @@ import torch from .adv_attack import AdvAttack -class PGDAttack(AdvAttack): - def __init__(self, model, eps, alpha, norm, max_iter=10, - random_eps=False, num_random_init=0, loss=None, - norm_time=False, time_dim=None, - targeted=False, range_min=None, range_max=None): - super().__init__( - model, loss, targeted, range_min, range_max) +class PGDAttack(AdvAttack): + def __init__( + self, + model, + eps, + alpha, + norm, + max_iter=10, + random_eps=False, + num_random_init=0, + loss=None, + norm_time=False, + time_dim=None, + targeted=False, + range_min=None, + range_max=None, + ): + super().__init__(model, loss, targeted, range_min, range_max) self.eps = eps self.alpha = alpha self.max_iter = max_iter @@ -25,94 +36,93 @@ def __init__(self, model, eps, alpha, norm, max_iter=10, self.norm_time = norm_time self.time_dim = time_dim - @property def attack_info(self): info = super().attack_info if self.norm == 1: - threat = 'l1' + threat = "l1" elif self.norm == 2: - threat = 'l2' + threat = "l2" else: - threat = 'linf' - - new_info = {'eps': self.eps, - 'alpha': self.alpha, - 'norm': self.norm, - 'max_iter': self.max_iter, - 'random_eps': self.random_eps, - 'num_random_init': self.num_random_init, - 'threat_model': threat, - 'attack_type': 'pgd', - 'norm_time': self.norm_time } + threat = "linf" + + new_info = { + "eps": self.eps, + "alpha": self.alpha, + "norm": self.norm, + "max_iter": self.max_iter, + "random_eps": self.random_eps, + "num_random_init": self.num_random_init, + "threat_model": threat, + "attack_type": "pgd", + "norm_time": self.norm_time, + } info.update(new_info) return info - @staticmethod def _project(delta, eps, norm): - if norm == 'inf' or norm == float('inf'): + if norm == "inf" or norm == float("inf"): return torch.clamp(delta, -eps, eps) delta_tmp = torch.reshape(delta, (delta.shape[0], -1)) one = torch.ones((1,), dtype=delta.dtype, device=delta.device) if norm == 2: - delta_tmp = delta_tmp*torch.min( - one, eps/torch.norm(delta_tmp, dim=1, keepdim=True)) + delta_tmp = delta_tmp * torch.min( + one, eps / torch.norm(delta_tmp, dim=1, keepdim=True) + ) elif norm == 1: - delta_tmp = delta_tmp*torch.min( - one, eps/torch.norm(delta_tmp, dim=1, keepdim=True, p=1)) + delta_tmp = delta_tmp * torch.min( + one, eps / torch.norm(delta_tmp, dim=1, keepdim=True, p=1) + ) else: - raise Exception('norm={} not supported'.format(norm)) - - return torch.reshape(delta_tmp, delta.shape) + raise 
Exception("norm={} not supported".format(norm)) + return torch.reshape(delta_tmp, delta.shape) @staticmethod def _random_sphere(shape, eps, norm, dtype, device): """We use Theorem 1 in https://arxiv.org/pdf/math/0503650.pdf - to sample uniformly from l_p balls in R^n + to sample uniformly from l_p balls in R^n """ - if norm == 'inf' or norm == float('inf'): - return 2*eps*(torch.rand(shape, dtype=dtype, device=device)-0.5) + if norm == "inf" or norm == float("inf"): + return 2 * eps * (torch.rand(shape, dtype=dtype, device=device) - 0.5) # Sample from exponential e^(-t) distribution - u = torch.rand((shape[0],1), dtype=dtype, device=device) - z = - (-u).log1p() + u = torch.rand((shape[0], 1), dtype=dtype, device=device) + z = -(-u).log1p() if norm == 2: # sample from \propto exp(-|t|^p) - u = torch.randn(shape, dtype=dtype, device=device).reshape(shape[0],-1) + u = torch.randn(shape, dtype=dtype, device=device).reshape(shape[0], -1) # compute norm l2 = torch.norm(u, dim=1, keepdim=True) # apply theorem and rescale norm - x = eps * u / (l2**2 + z).sqrt() + x = eps * u / (l2 ** 2 + z).sqrt() elif norm == 1: # sample from \propto exp(-|t|^p) - u = torch.rand(shape, dtype=dtype, device=device).reshape(shape[0],-1) - u = - (-u).log1p() + u = torch.rand(shape, dtype=dtype, device=device).reshape(shape[0], -1) + u = -(-u).log1p() # compute norm l1 = torch.norm(u, dim=1, keepdim=True, p=1) # apply theorem and rescale norm x = eps * u / (l1 + z) else: - raise Exception('norm={} not supported'.format(norm)) + raise Exception("norm={} not supported".format(norm)) return x.reshape(shape) - - def generate(self, input, target): f = 1 if self.targeted: f = -1 - + if self.random_eps: - eps = self.eps*torch.rand(1).item() - alpha = eps*self.alpha/self.eps + eps = self.eps * torch.rand(1).item() + alpha = eps * self.alpha / self.eps else: eps = self.eps alpha = self.alpha @@ -132,8 +142,7 @@ def generate(self, input, target): for k in range(max(1, self.num_random_init)): x = input if self.num_random_init > 0: - x = x + self._random_sphere( - x.shape, eps, self.norm , x.dtype, x.device) + x = x + self._random_sphere(x.shape, eps, self.norm, x.dtype, x.device) x = self._clamp(x) for it in range(self.max_iter): @@ -145,7 +154,7 @@ def generate(self, input, target): loss.backward() dL_x = x.grad.data x = x + f * alpha * dL_x.sign() - delta = self._project(x-input, eps, self.norm) + delta = self._project(x - input, eps, self.norm) x = input + delta x = self._clamp(x) @@ -156,11 +165,9 @@ def generate(self, input, target): output = self.model(x) loss = self.loss(output, target).mean().item() - #if nontargeted we want higher loss, if targeted we want lower loss + # if nontargeted we want higher loss, if targeted we want lower loss if best_loss is None or best_loss < f * loss: best_x = x best_loss = f * loss - - return best_x - + return best_x diff --git a/hyperion/torch/adv_attacks/rand_fgsm_attack.py b/hyperion/torch/adv_attacks/rand_fgsm_attack.py index 3e713796..69d9c855 100644 --- a/hyperion/torch/adv_attacks/rand_fgsm_attack.py +++ b/hyperion/torch/adv_attacks/rand_fgsm_attack.py @@ -6,28 +6,36 @@ from .adv_attack import AdvAttack -class RandFGSMAttack(AdvAttack): - - def __init__(self, model, eps, alpha, loss=None, targeted=False, range_min=None, range_max=None): - super().__init__( - model, loss, targeted, range_min, range_max) - assert alpha < eps, 'alpha({}) >= eps({})'.format(alpha, eps) +class RandFGSMAttack(AdvAttack): + def __init__( + self, + model, + eps, + alpha, + loss=None, + targeted=False, + 
range_min=None, + range_max=None, + ): + super().__init__(model, loss, targeted, range_min, range_max) + + assert alpha < eps, "alpha({}) >= eps({})".format(alpha, eps) self.eps = eps self.alpha = alpha - @property def attack_info(self): info = super().attack_info - new_info = {'eps': self.eps, - 'alpha': self.alpha, - 'threat_model': 'linf', - 'attack_type': 'rand-fgsm'} + new_info = { + "eps": self.eps, + "alpha": self.alpha, + "threat_model": "linf", + "attack_type": "rand-fgsm", + } info.update(new_info) return info - def generate(self, input, target): x = input + self.alpha * torch.randn_like(input).sign() @@ -38,12 +46,10 @@ def generate(self, input, target): self.model.zero_grad() loss.backward() dL_x = x.grad.data - + f = 1 if self.targeted: f = -1 adv_ex = x + f * (self.eps - self.alpha) * dL_x.sign() return self._clamp(adv_ex) - - diff --git a/hyperion/torch/adv_attacks/random_attack_factory.py b/hyperion/torch/adv_attacks/random_attack_factory.py index fe7fff09..e333b119 100644 --- a/hyperion/torch/adv_attacks/random_attack_factory.py +++ b/hyperion/torch/adv_attacks/random_attack_factory.py @@ -9,32 +9,45 @@ import torch from .attack_factory import AttackFactory as AF -class RandomAttackFactory(object): - def __init__(self, attack_types, - min_eps=1e-5, max_eps=0.1, - min_snr=30, max_snr=60, - min_alpha=1e-5, max_alpha=0.02, - norms=[float('inf')], - random_eps=False, - min_num_random_init=0, max_num_random_init=3, - min_confidence=0, max_confidence=1, - min_lr=1e-3, max_lr=1e-2, - min_binary_search_steps=9, max_binary_search_steps=9, - min_iter=5, max_iter=10, - abort_early=True, - min_c=1e-3, max_c=1e-2, - reduce_c=False, - c_incr_factor=2, - tau_decr_factor=0.9, - indep_channels=False, - norm_time=False, - time_dim=None, - use_snr=False, - loss=None, - targeted=False, - range_min=None, range_max=None, - eps_scale=1): +class RandomAttackFactory(object): + def __init__( + self, + attack_types, + min_eps=1e-5, + max_eps=0.1, + min_snr=30, + max_snr=60, + min_alpha=1e-5, + max_alpha=0.02, + norms=[float("inf")], + random_eps=False, + min_num_random_init=0, + max_num_random_init=3, + min_confidence=0, + max_confidence=1, + min_lr=1e-3, + max_lr=1e-2, + min_binary_search_steps=9, + max_binary_search_steps=9, + min_iter=5, + max_iter=10, + abort_early=True, + min_c=1e-3, + max_c=1e-2, + reduce_c=False, + c_incr_factor=2, + tau_decr_factor=0.9, + indep_channels=False, + norm_time=False, + time_dim=None, + use_snr=False, + loss=None, + targeted=False, + range_min=None, + range_max=None, + eps_scale=1, + ): self.attack_types = attack_types self.min_eps = min_eps @@ -45,8 +58,8 @@ def __init__(self, attack_types, self.max_alpha = max_alpha self.norms = norms self.random_eps = random_eps - self.min_num_random_init = min_num_random_init - self.max_num_random_init = max_num_random_init + self.min_num_random_init = min_num_random_init + self.max_num_random_init = max_num_random_init self.min_confidence = min_confidence self.max_confidence = max_confidence self.min_lr = min_lr @@ -71,236 +84,353 @@ def __init__(self, attack_types, self.range_max = range_max self.eps_scale = eps_scale - @staticmethod def _choice(n): return torch.randint(low=0, high=n, size=(1,)).item() - @staticmethod def _randint(min_val, max_val): - return torch.randint(low=min_val, high=max_val+1, size=(1,)).item() - + return torch.randint(low=min_val, high=max_val + 1, size=(1,)).item() @staticmethod def _uniform(min_val, max_val): return (max_val - min_val) * torch.rand(size=(1,)).item() + min_val - @staticmethod def 
_log_uniform(min_val, max_val): - log_x = (math.log(max_val) - math.log(min_val)) * torch.rand(size=(1,)).item() + math.log(min_val) + log_x = (math.log(max_val) - math.log(min_val)) * torch.rand( + size=(1,) + ).item() + math.log(min_val) return math.exp(log_x) - def _sample_attack_args(self): attack_args = {} attack_idx = self._choice(len(self.attack_types)) - attack_args['attack_type'] = self.attack_types[attack_idx] - eps = self._log_uniform(self.min_eps, self.max_eps) - attack_args['eps'] = eps - attack_args['alpha'] = self._log_uniform( - min(eps, self.min_alpha), - min(eps, self.max_alpha)) - attack_args['norm'] = self.norms[self._choice(len(self.norms))] - attack_args['random_eps'] = self.random_eps - attack_args['num_random_init'] = self._randint( - self.min_num_random_init, self.max_num_random_init) - attack_args['confidence'] = self._uniform( - self.min_confidence, self.max_confidence) - attack_args['lr'] = self._uniform(self.min_lr, self.max_lr) - attack_args['binary_search_steps'] = self._randint( - self.min_binary_search_steps, self.max_binary_search_steps) - attack_args['max_iter'] = self._randint(self.min_iter, self.max_iter) - attack_args['abort_early'] = self.abort_early - attack_args['c'] = self._uniform(self.min_c, self.max_c) - attack_args['reduce_c'] = self.reduce_c - attack_args['c_incr_factor'] = self.c_incr_factor - attack_args['tau_decr_factor'] = self.tau_decr_factor - attack_args['indep_channels'] = self.indep_channels - attack_args['norm_time'] = self.norm_time - attack_args['time_dim'] = self.time_dim - attack_args['use_snr'] = self.use_snr - attack_args['targeted'] = self.targeted - attack_args['range_min'] = self.range_min - attack_args['range_max'] = self.range_max - attack_args['eps_scale'] = self.eps_scale - attack_args['loss'] = self.loss - - return attack_args + attack_args["attack_type"] = self.attack_types[attack_idx] + eps = self._log_uniform(self.min_eps, self.max_eps) + attack_args["eps"] = eps + attack_args["alpha"] = self._log_uniform( + min(eps, self.min_alpha), min(eps, self.max_alpha) + ) + attack_args["norm"] = self.norms[self._choice(len(self.norms))] + attack_args["random_eps"] = self.random_eps + attack_args["num_random_init"] = self._randint( + self.min_num_random_init, self.max_num_random_init + ) + attack_args["confidence"] = self._uniform( + self.min_confidence, self.max_confidence + ) + attack_args["lr"] = self._uniform(self.min_lr, self.max_lr) + attack_args["binary_search_steps"] = self._randint( + self.min_binary_search_steps, self.max_binary_search_steps + ) + attack_args["max_iter"] = self._randint(self.min_iter, self.max_iter) + attack_args["abort_early"] = self.abort_early + attack_args["c"] = self._uniform(self.min_c, self.max_c) + attack_args["reduce_c"] = self.reduce_c + attack_args["c_incr_factor"] = self.c_incr_factor + attack_args["tau_decr_factor"] = self.tau_decr_factor + attack_args["indep_channels"] = self.indep_channels + attack_args["norm_time"] = self.norm_time + attack_args["time_dim"] = self.time_dim + attack_args["use_snr"] = self.use_snr + attack_args["targeted"] = self.targeted + attack_args["range_min"] = self.range_min + attack_args["range_max"] = self.range_max + attack_args["eps_scale"] = self.eps_scale + attack_args["loss"] = self.loss + return attack_args def sample_attack(self, model=None): attack_args = self._sample_attack_args() - attack_args['model'] = model + attack_args["model"] = model return AF.create(**attack_args) - @staticmethod def filter_args(**kwargs): - if 'no_abort' in kwargs: - 
kwargs['abort_early'] = not kwargs['no_abort'] - - if 'norms' in kwargs: - kwargs['norms'] = [float(a) for a in kwargs['norms']] - - valid_args = ('attack_types', - 'min_eps', 'max_eps', - 'min_snr', 'max_snr', - 'norms', 'random_eps', - 'min_num_random_init', 'max_num_random_init', - 'min_alpha', 'max_alpha', - 'min_confidence', 'max_confidence', - 'min_lr', 'max_lr', - 'min_binary_search_steps', 'max_binary_search_steps', - 'min_iter', 'max_iter', 'abort_early', - 'min_c', 'max_c', 'reduce_c', - 'c_incr_factor', 'tau_decr_factor', - 'indep_channels', 'use_snr', 'norm_time', - 'targeted') - - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + if "no_abort" in kwargs: + kwargs["abort_early"] = not kwargs["no_abort"] + + if "norms" in kwargs: + kwargs["norms"] = [float(a) for a in kwargs["norms"]] + + valid_args = ( + "attack_types", + "min_eps", + "max_eps", + "min_snr", + "max_snr", + "norms", + "random_eps", + "min_num_random_init", + "max_num_random_init", + "min_alpha", + "max_alpha", + "min_confidence", + "max_confidence", + "min_lr", + "max_lr", + "min_binary_search_steps", + "max_binary_search_steps", + "min_iter", + "max_iter", + "abort_early", + "min_c", + "max_c", + "reduce_c", + "c_incr_factor", + "tau_decr_factor", + "indep_channels", + "use_snr", + "norm_time", + "targeted", + ) + + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args - - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--attack-types', type=str.lower, default=['fgsm'], nargs='+', - choices=['fgsm', 'snr-fgsm', 'rand-fgsm', 'iter-fgsm', - 'cw-l0', 'cw-l2', 'cw-linf', 'pgd'], - help=('Attack types')) + "--attack-types", + type=str.lower, + default=["fgsm"], + nargs="+", + choices=[ + "fgsm", + "snr-fgsm", + "rand-fgsm", + "iter-fgsm", + "cw-l0", + "cw-l2", + "cw-linf", + "pgd", + ], + help=("Attack types"), + ) parser.add_argument( - '--norms', type=float, default=[float('inf')], nargs='+', - choices=[float('inf'), 1, 2], help=('Attack perturbation norms')) + "--norms", + type=float, + default=[float("inf")], + nargs="+", + choices=[float("inf"), 1, 2], + help=("Attack perturbation norms"), + ) parser.add_argument( - '--min-eps', default=1e-5, type=float, - help=('attack min epsilon, upper bound for the perturbation norm')) + "--min-eps", + default=1e-5, + type=float, + help=("attack min epsilon, upper bound for the perturbation norm"), + ) parser.add_argument( - '--max-eps', default=0.1, type=float, - help=('attack max epsilon, upper bound for the perturbation norm')) + "--max-eps", + default=0.1, + type=float, + help=("attack max epsilon, upper bound for the perturbation norm"), + ) parser.add_argument( - '--min-snr', default=30, type=float, - help=('min upper bound for the signal-to-noise ratio of the ' - 'perturbed signal')) + "--min-snr", + default=30, + type=float, + help=( + "min upper bound for the signal-to-noise ratio of the " + "perturbed signal" + ), + ) parser.add_argument( - '--max-snr', default=60, type=float, - help=('max upper bound for the signal-to-noise ratio of the ' - 'perturbed signal')) + "--max-snr", + default=60, + type=float, + help=( + "max upper bound for the signal-to-noise ratio of the " + "perturbed signal" + ), + ) parser.add_argument( - '--min-alpha', default=1e-5, type=float, - help=('min alpha for iter and rand fgsm attack')) + "--min-alpha", + default=1e-5, + type=float, + help=("min alpha for 
iter and rand fgsm attack"), + ) parser.add_argument( - '--max-alpha', default=0.02, type=float, - help=('max alpha for iter and rand fgsm attack')) + "--max-alpha", + default=0.02, + type=float, + help=("max alpha for iter and rand fgsm attack"), + ) parser.add_argument( - '--random-eps', default=False, action='store_true', - help=('use random epsilon in PGD attack')) + "--random-eps", + default=False, + action="store_true", + help=("use random epsilon in PGD attack"), + ) parser.add_argument( - '--min-confidence', default=0, type=float, - help=('min confidence for carlini-wagner attack')) + "--min-confidence", + default=0, + type=float, + help=("min confidence for carlini-wagner attack"), + ) parser.add_argument( - '--max-confidence', default=1, type=float, - help=('max confidence for carlini-wagner attack')) + "--max-confidence", + default=1, + type=float, + help=("max confidence for carlini-wagner attack"), + ) parser.add_argument( - '--min-lr', default=1e-3, type=float, - help=('min learning rate for attack optimizers')) + "--min-lr", + default=1e-3, + type=float, + help=("min learning rate for attack optimizers"), + ) parser.add_argument( - '--max-lr', default=1e-2, type=float, - help=('max learning rate for attack optimizers')) + "--max-lr", + default=1e-2, + type=float, + help=("max learning rate for attack optimizers"), + ) parser.add_argument( - '--min-binary-search-steps', default=9, type=int, - help=('min num bin. search steps in carlini-wagner-l2 attack')) + "--min-binary-search-steps", + default=9, + type=int, + help=("min num bin. search steps in carlini-wagner-l2 attack"), + ) parser.add_argument( - '--max-binary-search-steps', default=9, type=int, - help=('max num bin. search steps in carlini-wagner-l2 attack')) + "--max-binary-search-steps", + default=9, + type=int, + help=("max num bin. search steps in carlini-wagner-l2 attack"), + ) parser.add_argument( - '--min-iter', default=5, type=int, - help=('min maximum. num. of optim iters in attack')) + "--min-iter", + default=5, + type=int, + help=("min maximum. num. of optim iters in attack"), + ) parser.add_argument( - '--max-iter', default=10, type=int, - help=('max maximum num. of optim iters in attack')) + "--max-iter", + default=10, + type=int, + help=("max maximum num. 
of optim iters in attack"), + ) parser.add_argument( - '--min-c', default=1e-3, type=float, - help=('min initial weight of constraint function f ' - 'in carlini-wagner attack')) + "--min-c", + default=1e-3, + type=float, + help=( + "min initial weight of constraint function f " + "in carlini-wagner attack" + ), + ) parser.add_argument( - '--max-c', default=1e-2, type=float, - help=('max initial weight of constraint function f ' - 'in carlini-wagner attack')) + "--max-c", + default=1e-2, + type=float, + help=( + "max initial weight of constraint function f " + "in carlini-wagner attack" + ), + ) parser.add_argument( - '--reduce-c', default=False, action='store_true', - help=('allow to reduce c in carline-wagner-l0/inf attack')) + "--reduce-c", + default=False, + action="store_true", + help=("allow to reduce c in carlini-wagner-l0/inf attack"), + ) parser.add_argument( - '--c-incr-factor', default=2, type=float, - help=('factor to increment c in carline-wagner-l0/inf attack')) + "--c-incr-factor", + default=2, + type=float, + help=("factor to increment c in carlini-wagner-l0/inf attack"), + ) parser.add_argument( - '--tau-decr-factor', default=0.75, type=float, - help=('factor to reduce tau in carline-wagner-linf attack')) + "--tau-decr-factor", + default=0.75, + type=float, + help=("factor to reduce tau in carlini-wagner-linf attack"), + ) parser.add_argument( - '--indep-channels', default=False, action='store_true', - help=('consider independent input channels in ' - 'carlini-wagner-l0 attack')) + "--indep-channels", + default=False, + action="store_true", + help=("consider independent input channels in " "carlini-wagner-l0 attack"), + ) parser.add_argument( - '--no-abort', default=False, action='store_true', - help=('do not abort early in optimizer iterations')) + "--no-abort", + default=False, + action="store_true", + help=("do not abort early in optimizer iterations"), + ) parser.add_argument( - '--min-num-random-init', default=1, type=int, - help=('min number of random initializations in PGD attack')) + "--min-num-random-init", + default=1, + type=int, + help=("min number of random initializations in PGD attack"), + ) parser.add_argument( - '--max-num-random-init', default=5, type=int, - help=('max number of random initializations in PGD attack')) + "--max-num-random-init", + default=5, + type=int, + help=("max number of random initializations in PGD attack"), + ) parser.add_argument( - '--targeted', default=False, action='store_true', - help='use targeted attack intead of non-targeted') + "--targeted", + default=False, + action="store_true", + help="use targeted attack instead of non-targeted", + ) parser.add_argument( - '--use-snr', default=False, action='store_true', - help=('In carlini-wagner attack maximize SNR instead of ' - 'minimize perturbation norm')) + "--use-snr", + default=False, + action="store_true", + help=( + "In carlini-wagner attack maximize SNR instead of " + "minimize perturbation norm" + ), + ) parser.add_argument( - '--norm-time', default=False, action='store_true', - help=('normalize norm by number of samples in time dimension')) + "--norm-time", + default=False, + action="store_true", + help=("normalize norm by number of samples in time dimension"), + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='adversarial attack options') + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='adversarial attack options') add_argparse_args = add_class_args diff --git 
a/hyperion/torch/adv_attacks/snr_fgsm_attack.py b/hyperion/torch/adv_attacks/snr_fgsm_attack.py index bcd8c3c6..88b96da3 100644 --- a/hyperion/torch/adv_attacks/snr_fgsm_attack.py +++ b/hyperion/torch/adv_attacks/snr_fgsm_attack.py @@ -6,24 +6,21 @@ from .adv_attack import AdvAttack -class SNRFGSMAttack(AdvAttack): - def __init__(self, model, snr, loss=None, targeted=False, - range_min=None, range_max=None): +class SNRFGSMAttack(AdvAttack): + def __init__( + self, model, snr, loss=None, targeted=False, range_min=None, range_max=None + ): super().__init__(model, loss, targeted, range_min, range_max) self.snr = snr - @property def attack_info(self): info = super().attack_info - new_info = {'snr': self.snr, - 'threat_model': 'snr', - 'attack_type': 'snr-fgsm' } + new_info = {"snr": self.snr, "threat_model": "snr", "attack_type": "snr-fgsm"} info.update(new_info) return info - def generate(self, input, target): input.requires_grad = True @@ -33,16 +30,16 @@ def generate(self, input, target): self.model.zero_grad() loss.backward() dL_x = input.grad.data - - dim = tuple(i for i in range(1,input.dim())) - P_x = 10*torch.log10(torch.mean(input**2, dim=dim, keepdim=True)) + + dim = tuple(i for i in range(1, input.dim())) + P_x = 10 * torch.log10(torch.mean(input ** 2, dim=dim, keepdim=True)) noise = dL_x.sign() - P_n = 10*torch.log10(torch.mean(noise**2, dim=dim, keepdim=True)) + P_n = 10 * torch.log10(torch.mean(noise ** 2, dim=dim, keepdim=True)) snr_0 = P_x - P_n dsnr = self.snr - snr_0 - eps = 10**(-dsnr/20) + eps = 10 ** (-dsnr / 20) f = 1 if self.targeted: @@ -50,5 +47,3 @@ def generate(self, input, target): adv_ex = input + f * eps * noise return self._clamp(adv_ex) - - diff --git a/hyperion/torch/adv_defenses/wave_gan_white.py b/hyperion/torch/adv_defenses/wave_gan_white.py index 4dae44f5..ad7f985e 100644 --- a/hyperion/torch/adv_defenses/wave_gan_white.py +++ b/hyperion/torch/adv_defenses/wave_gan_white.py @@ -11,6 +11,7 @@ import torch import yaml + try: # import parallel_wavegan.models from parallel_wavegan.layers import PQMF @@ -24,8 +25,9 @@ class WaveGANReconstruction(nn.Module): - def __init__(self, feature_extractor, wave_gan, pqmf, use_noise_input, - config, pad_fn): + def __init__( + self, feature_extractor, wave_gan, pqmf, use_noise_input, config, pad_fn + ): super().__init__() self.feature_extractor = feature_extractor self.wave_gan = wave_gan @@ -52,22 +54,22 @@ def forward(self, audio): # Setup inputs inputs = () if use_noise_input: - noise = torch.randn(1, - 1, - len(mel_spectrogram) * config["hop_size"], - device=mel_spectrogram.device) - inputs += (noise, ) - - mel_spectrogram = pad_fn( - mel_spectrogram.unsqueeze(0).transpose(2, 1)) - inputs += (mel_spectrogram, ) + noise = torch.randn( + 1, + 1, + len(mel_spectrogram) * config["hop_size"], + device=mel_spectrogram.device, + ) + inputs += (noise,) + + mel_spectrogram = pad_fn(mel_spectrogram.unsqueeze(0).transpose(2, 1)) + inputs += (mel_spectrogram,) # Generate if config["generator_params"]["out_channels"] == 1: reconstructed_audio = wave_gan(*inputs).view(-1) reconstructed_audio = reconstructed_audio[:num_samples] else: - reconstructed_audio = pqmf.synthesis( - wave_gan(*inputs)).view(-1) + reconstructed_audio = pqmf.synthesis(wave_gan(*inputs)).view(-1) reconstructed_audio = reconstructed_audio[:num_samples] return reconstructed_audio @@ -80,22 +82,21 @@ def forward(self, audio): # Setup inputs inputs = () if use_noise_input: - noise = torch.randn(1, - 1, - len(mel_spectrogram) * - config["hop_size"], - 
device=recording.device) - inputs += (noise, ) - mel_spectrogram = pad_fn( - mel_spectrogram.unsqueeze(0).transpose(2, 1)) - inputs += (mel_spectrogram, ) + noise = torch.randn( + 1, + 1, + len(mel_spectrogram) * config["hop_size"], + device=recording.device, + ) + inputs += (noise,) + mel_spectrogram = pad_fn(mel_spectrogram.unsqueeze(0).transpose(2, 1)) + inputs += (mel_spectrogram,) # Generate if config["generator_params"]["out_channels"] == 1: reconstructed_audio = wave_gan(*inputs).view(-1) reconstructed_audio = reconstructed_audio[:num_samples] else: - reconstructed_audio = pqmf.synthesis( - wave_gan(*inputs)).view(-1) + reconstructed_audio = pqmf.synthesis(wave_gan(*inputs)).view(-1) reconstructed_audio = reconstructed_audio[:, :num_samples] reconstructions.append(reconstructed_audio) return torch.stack(reconstructions) @@ -104,31 +105,37 @@ def forward(self, audio): class WaveGANDefender(nn.Module): def __init__(self, wave_gan_model_dir: Path, wave_gan_model_ckpt: Path): super().__init__() - with open(wave_gan_model_dir / 'config.yml') as f: + with open(wave_gan_model_dir / "config.yml") as f: self.config = yaml.load(f, Loader=yaml.Loader) self.feature_extractor = WaveGANFeatureExtractor(wave_gan_model_dir) - self.model = ParallelWaveGANGenerator( - **self.config["generator_params"]) + self.model = ParallelWaveGANGenerator(**self.config["generator_params"]) self.model.load_state_dict( - torch.load(wave_gan_model_dir / wave_gan_model_ckpt, - map_location='cpu')["model"]["generator"]) + torch.load(wave_gan_model_dir / wave_gan_model_ckpt, map_location="cpu")[ + "model" + ]["generator"] + ) self.model.remove_weight_norm() - #self.use_noise_input = not isinstance(self.model, parallel_wavegan.models.MelGANGenerator) + # self.use_noise_input = not isinstance(self.model, parallel_wavegan.models.MelGANGenerator) self.use_noise_input = True self.pad_fn = torch.nn.ReplicationPad1d( - self.config["generator_params"].get("aux_context_window", 0)) + self.config["generator_params"].get("aux_context_window", 0) + ) if self.config["generator_params"]["out_channels"] > 1: self.pqmf = PQMF(self.config["generator_params"]["out_channels"]) else: self.pqmf = None - self.reconstructor = WaveGANReconstruction(self.feature_extractor, - self.model, self.pqmf, - self.use_noise_input, - self.config, self.pad_fn) + self.reconstructor = WaveGANReconstruction( + self.feature_extractor, + self.model, + self.pqmf, + self.use_noise_input, + self.config, + self.pad_fn, + ) def forward(self, audio: torch.Tensor) -> torch.Tensor: return self.reconstructor(audio) @@ -157,16 +164,18 @@ def forward(self, audio: torch.Tensor) -> torch.Tensor: # return torch.cat(audio_chunks) -def logmelfilterbank(audio, - sampling_rate, - fft_size=1024, - hop_size=256, - win_length=None, - window="hann", - num_mels=80, - fmin=None, - fmax=None, - eps=1e-10): +def logmelfilterbank( + audio, + sampling_rate, + fft_size=1024, + hop_size=256, + win_length=None, + window="hann", + num_mels=80, + fmin=None, + fmax=None, + eps=1e-10, +): """Compute log-Mel filterbank feature. 
Args: @@ -197,36 +206,45 @@ def logmelfilterbank(audio, # return np.log10(np.maximum(eps, np.dot(spc, mel_basis.T))) # logger.info('{} {}'.format(audio.shape, audio.device)) - x_stft2 = torch.stft(audio, - n_fft=fft_size, - hop_length=hop_size, - win_length=win_length, - window=window, - pad_mode="reflect").transpose(0, 1)**2 + x_stft2 = ( + torch.stft( + audio, + n_fft=fft_size, + hop_length=hop_size, + win_length=win_length, + window=window, + pad_mode="reflect", + ).transpose(0, 1) + ** 2 + ) # logger.info('{} {}'.format(x_stft2.shape, x_stft2.device)) spc = (x_stft2[:, :, 0] + x_stft2[:, :, 1]).sqrt() # get mel basis fmin = 0 if fmin is None else fmin fmax = sampling_rate / 2 if fmax is None else fmax - mel_basis = torch.tensor(librosa.filters.mel(sampling_rate, fft_size, - num_mels, fmin, fmax), - device=spc.device).transpose(0, 1) + mel_basis = torch.tensor( + librosa.filters.mel(sampling_rate, fft_size, num_mels, fmin, fmax), + device=spc.device, + ).transpose(0, 1) return torch.matmul(spc, mel_basis).clamp(min=eps).log10() class WaveGANFeatureExtractor(nn.Module): def __init__(self, wave_gan_model_dir): super().__init__() - with open(wave_gan_model_dir / 'config.yml') as f: + with open(wave_gan_model_dir / "config.yml") as f: self.config = yaml.load(f, Loader=yaml.Loader) - win_len = self.config["fft_size"] if self.config[ - "win_length"] is None else self.config["win_length"] - self.register_buffer('window', torch.hann_window(win_len)) + win_len = ( + self.config["fft_size"] + if self.config["win_length"] is None + else self.config["win_length"] + ) + self.register_buffer("window", torch.hann_window(win_len)) # Restore scaler - stats_path = str(wave_gan_model_dir / 'stats.h5') + stats_path = str(wave_gan_model_dir / "stats.h5") if self.config["format"] == "hdf5": scaler_mean = read_hdf5(stats_path, "mean") scaler_scale = read_hdf5(stats_path, "scale") @@ -236,20 +254,22 @@ def __init__(self, wave_gan_model_dir): else: raise ValueError("support only hdf5 or npy format.") - self.register_buffer('scaler_mean', torch.tensor(scaler_mean)) - self.register_buffer('scaler_scale', torch.tensor(scaler_scale)) + self.register_buffer("scaler_mean", torch.tensor(scaler_mean)) + self.register_buffer("scaler_scale", torch.tensor(scaler_scale)) def transform(self, audio): - mel = logmelfilterbank(audio, - sampling_rate=self.config["sampling_rate"], - hop_size=self.config["hop_size"], - fft_size=self.config["fft_size"], - win_length=self.config["win_length"], - window=self.window, - num_mels=self.config["num_mels"], - fmin=self.config["fmin"], - fmax=self.config["fmax"]) + mel = logmelfilterbank( + audio, + sampling_rate=self.config["sampling_rate"], + hop_size=self.config["hop_size"], + fft_size=self.config["fft_size"], + win_length=self.config["win_length"], + window=self.window, + num_mels=self.config["num_mels"], + fmin=self.config["fmin"], + fmax=self.config["fmax"], + ) # Normalize the mel spectrogram mel = (mel - self.scaler_mean) / self.scaler_scale diff --git a/hyperion/torch/data/__init__.py b/hyperion/torch/data/__init__.py index 469bb650..4deb3f25 100644 --- a/hyperion/torch/data/__init__.py +++ b/hyperion/torch/data/__init__.py @@ -9,5 +9,5 @@ from .audio_dataset import AudioDataset -#samplers +# samplers from .weighted_seq_sampler import ClassWeightedSeqSampler diff --git a/hyperion/torch/data/paired_feat_seq_dataset.py b/hyperion/torch/data/paired_feat_seq_dataset.py index 5162abc2..671bb6bf 100644 --- a/hyperion/torch/data/paired_feat_seq_dataset.py +++ 
b/hyperion/torch/data/paired_feat_seq_dataset.py @@ -13,38 +13,44 @@ from ...utils.utt2info import Utt2Info from .feat_seq_dataset import FeatSeqDataset -class PairedFeatSeqDataset(FeatSeqDataset): - - def __init__(self, rspecifier, key_file, - pairs_file, - class_file = None, - num_frames_file = None, - path_prefix=None, - min_chunk_length=1, - max_chunk_length=None, - return_fullseqs=False, - return_class=True, transpose_input=True, is_val=False): +class PairedFeatSeqDataset(FeatSeqDataset): + def __init__( + self, + rspecifier, + key_file, + pairs_file, + class_file=None, + num_frames_file=None, + path_prefix=None, + min_chunk_length=1, + max_chunk_length=None, + return_fullseqs=False, + return_class=True, + transpose_input=True, + is_val=False, + ): super().__init__( - rspecifier, key_file, + rspecifier, + key_file, class_file=class_file, num_frames_file=num_frames_file, path_prefix=path_prefix, min_chunk_length=min_chunk_length, max_chunk_length=max_chunk_length, return_fullseqs=return_fullseqs, - return_class=return_class, transpose_input=transpose_input, - is_val=is_val) + return_class=return_class, + transpose_input=transpose_input, + is_val=is_val, + ) - logging.info('loading utt pairs file %s' % key_file) - u2pair = Utt2Info.load(pairs_file, sep=' ') + logging.info("loading utt pairs file %s" % pairs_file) + u2pair = Utt2Info.load(pairs_file, sep=" ") u2pair_dict = {} - for u,p in u2pair: + for u, p in u2pair: u2pair_dict[u] = p self.u2pair = u2pair_dict - - def _get_fullseq(self, index): key = self.u2c.key[index] @@ -56,11 +62,10 @@ def _get_fullseq(self, index): x_pair = x_pair.T if not self.return_class: return x, x_pair - + class_idx = self.utt_idx2class[index] return x, x_pair, class_idx - def _get_random_chunk(self, index): if len(index) == 2: @@ -70,16 +75,18 @@ def _get_random_chunk(self, index): key = self.u2c.key[index] full_seq_length = int(self.seq_lengths[index]) - assert chunk_length <= full_seq_length, 'chunk_length(%d) <= full_seq_length(%d)' % ( - chunk_length, full_seq_length) + assert ( + chunk_length <= full_seq_length + ), "chunk_length(%d) <= full_seq_length(%d)" % (chunk_length, full_seq_length) first_frame = torch.randint( - low=0, high=full_seq_length-chunk_length+1, size=(1,)).item() + low=0, high=full_seq_length - chunk_length + 1, size=(1,) + ).item() - x = self.r.read([key], row_offset=first_frame, - num_rows=chunk_length)[0] + x = self.r.read([key], row_offset=first_frame, num_rows=chunk_length)[0] key_pair = self.u2pair[key] - x_pair = self.r.read([key_pair], row_offset=first_frame, - num_rows=chunk_length)[0] + x_pair = self.r.read([key_pair], row_offset=first_frame, num_rows=chunk_length)[ + 0 + ] x = x.astype(floatstr_torch(), copy=False) x_pair = x_pair.astype(floatstr_torch(), copy=False) @@ -89,7 +96,6 @@ def _get_random_chunk(self, index): if not self.return_class: return x, x_pair - + class_idx = self.utt_idx2class[index] return x, x_pair, class_idx - diff --git a/hyperion/torch/data/weighted_embed_sampler.py b/hyperion/torch/data/weighted_embed_sampler.py index fa8d1c38..61e4a0ad 100644 --- a/hyperion/torch/data/weighted_embed_sampler.py +++ b/hyperion/torch/data/weighted_embed_sampler.py @@ -13,31 +13,25 @@ class ClassWeightedEmbedSampler(Sampler): + def __init__(self, dataset, batch_size=1, iters_per_epoch=1, num_egs_per_class=1): - def __init__(self, dataset, batch_size=1, iters_per_epoch=1, - num_egs_per_class=1): - super().__init__(None) self.dataset = dataset self.batch_size = batch_size self.num_egs_per_class = num_egs_per_class 
self.batch = 0 - + self.iters_per_epoch = iters_per_epoch - self._len = int(math.ceil( - self.iters_per_epoch * len(dataset) / batch_size)) + self._len = int(math.ceil(self.iters_per_epoch * len(dataset) / batch_size)) - logging.info('num batches per epoch: %d' % self._len) - - self._num_classes_per_batch = int(math.ceil( - batch_size/num_egs_per_class)) - logging.info('num classes per batch: %d' % self._num_classes_per_batch) + logging.info("num batches per epoch: %d" % self._len) + self._num_classes_per_batch = int(math.ceil(batch_size / num_egs_per_class)) + logging.info("num classes per batch: %d" % self._num_classes_per_batch) def __len__(self): return self._len - - + def __iter__(self): self.batch = 0 return self @@ -46,40 +40,45 @@ def _remove_duplicate_idx(self, utt_idx): utt_idx_uniq = torch.unique(utt_idx) c = 0 # we make 3 tries to remove duplicate utt idx - delta = len(utt_idx) - len(utt_idx_uniq) - while delta > 0 and c < 3: + delta = len(utt_idx) - len(utt_idx_uniq) + while delta > 0 and c < 3: extra_idx = torch.randint(low=0, high=len(self.dataset), size=(delta,)) utt_idx = torch.cat((utt_idx_uniq, extra_idx)) utt_idx_uniq = torch.unique(utt_idx) - delta = len(utt_idx) - len(utt_idx_uniq) - c +=1 + delta = len(utt_idx) - len(utt_idx_uniq) + c += 1 return utt_idx - def _get_utt_idx(self): dataset = self.dataset num_classes_per_batch = self._num_classes_per_batch if dataset.class_weights is None: - class_idx = torch.randint(low=0, high=dataset.num_classes, - size=(num_classes_per_batch,)) + class_idx = torch.randint( + low=0, high=dataset.num_classes, size=(num_classes_per_batch,) + ) else: class_idx = torch.multinomial( - dataset.class_weights, - num_samples=num_classes_per_batch, replacement=True) + dataset.class_weights, + num_samples=num_classes_per_batch, + replacement=True, + ) if self.num_egs_per_class > 1: class_idx = class_idx.repeat(self.num_egs_per_class) - utt_idx = torch.as_tensor([ - dataset.class2utt_idx[c][ - torch.randint(low=0, high=int(dataset.class2num_utt[c]), size=(1,))] - for c in class_idx.tolist()]) + utt_idx = torch.as_tensor( + [ + dataset.class2utt_idx[c][ + torch.randint(low=0, high=int(dataset.class2num_utt[c]), size=(1,)) + ] + for c in class_idx.tolist() + ] + ) utt_idx = self._remove_duplicate_idx(utt_idx) return utt_idx - def __next__(self): if self.batch == self._len: @@ -87,7 +86,7 @@ def __next__(self): utt_idx = self._get_utt_idx() if self.batch == 0: - logging.info('batch 0 uttidx=%s', str(utt_idx[:10])) + logging.info("batch 0 uttidx=%s", str(utt_idx[:10])) self.batch += 1 return utt_idx.tolist() diff --git a/hyperion/torch/data/weighted_seq_sampler.py b/hyperion/torch/data/weighted_seq_sampler.py index 3da5e70e..9d128bb8 100644 --- a/hyperion/torch/data/weighted_seq_sampler.py +++ b/hyperion/torch/data/weighted_seq_sampler.py @@ -13,11 +13,18 @@ from torch.utils.data import Sampler import torch.distributed as dist + class ClassWeightedSeqSampler(Sampler): + def __init__( + self, + dataset, + batch_size=1, + iters_per_epoch="auto", + num_egs_per_class=1, + num_egs_per_utt=1, + var_batch_size=False, + ): - def __init__(self, dataset, batch_size=1, iters_per_epoch='auto', - num_egs_per_class=1, num_egs_per_utt=1, var_batch_size=False): - super().__init__(None) try: @@ -42,8 +49,8 @@ def __init__(self, dataset, batch_size=1, iters_per_epoch='auto', # when using ddp dummy = torch.rand(1000 * rank) del dummy - - if iters_per_epoch == 'auto': + + if iters_per_epoch == "auto": self._compute_iters_auto() else: self.iters_per_epoch = 
iters_per_epoch @@ -53,90 +60,99 @@ def __init__(self, dataset, batch_size=1, iters_per_epoch='auto', else: avg_batch_size = self.batch_size - self._len = int(math.ceil( - self.iters_per_epoch * dataset.num_seqs / avg_batch_size / world_size)) + self._len = int( + math.ceil( + self.iters_per_epoch * dataset.num_seqs / avg_batch_size / world_size + ) + ) + + logging.info("num batches per epoch: %d" % self._len) - logging.info('num batches per epoch: %d' % self._len) - - self._num_classes_per_batch = int(math.ceil( - batch_size/num_egs_per_class/num_egs_per_utt)) - logging.info('num classes per batch: %d' % self._num_classes_per_batch) + self._num_classes_per_batch = int( + math.ceil(batch_size / num_egs_per_class / num_egs_per_utt) + ) + logging.info("num classes per batch: %d" % self._num_classes_per_batch) - #self.weights = torch.as_tensor(dataset.class_weights, dtype=torch.double) - + # self.weights = torch.as_tensor(dataset.class_weights, dtype=torch.double) def _compute_avg_batch_size(self): dataset = self.dataset - avg_chunk_length = int((dataset.max_chunk_length + dataset.min_chunk_length)/2) - batch_mult = dataset.max_chunk_length/avg_chunk_length + avg_chunk_length = int( + (dataset.max_chunk_length + dataset.min_chunk_length) / 2 + ) + batch_mult = dataset.max_chunk_length / avg_chunk_length return int(self.batch_size * batch_mult) - - + def _compute_iters_auto(self): dataset = self.dataset avg_seq_length = np.mean(dataset.seq_lengths) - avg_chunk_length = int((dataset.max_chunk_length + dataset.min_chunk_length)/2) - self.iters_per_epoch = math.ceil(avg_seq_length/avg_chunk_length) - logging.debug('num iters per epoch: %d' % self.iters_per_epoch) - + avg_chunk_length = int( + (dataset.max_chunk_length + dataset.min_chunk_length) / 2 + ) + self.iters_per_epoch = math.ceil(avg_seq_length / avg_chunk_length) + logging.debug("num iters per epoch: %d" % self.iters_per_epoch) def __len__(self): return self._len - - def __iter__(self): self.batch = 0 return self - def _get_utt_idx_basic(self, batch_mult=1): dataset = self.dataset num_classes_per_batch = batch_mult * self._num_classes_per_batch if dataset.class_weights is None: - class_idx = torch.randint(low=0, high=dataset.num_classes, - size=(num_classes_per_batch,)) + class_idx = torch.randint( + low=0, high=dataset.num_classes, size=(num_classes_per_batch,) + ) else: class_idx = torch.multinomial( - dataset.class_weights, - num_samples=num_classes_per_batch, replacement=True) + dataset.class_weights, + num_samples=num_classes_per_batch, + replacement=True, + ) if self.num_egs_per_class > 1: class_idx = class_idx.repeat(self.num_egs_per_class) - utt_idx = torch.as_tensor([ - dataset.class2utt_idx[c][ - torch.randint(low=0, high=int(dataset.class2num_utt[c]), size=(1,))] - for c in class_idx.tolist()]) - - return utt_idx - + utt_idx = torch.as_tensor( + [ + dataset.class2utt_idx[c][ + torch.randint(low=0, high=int(dataset.class2num_utt[c]), size=(1,)) + ] + for c in class_idx.tolist() + ] + ) + return utt_idx def _get_utt_idx_seq_st_max_length(self, chunk_length, batch_mult=1): dataset = self.dataset num_classes_per_batch = batch_mult * self._num_classes_per_batch - #first we sample the batch classes + # first we sample the batch classes class_weights = dataset.class_weights.clone() # get classes with utt shorter than chunk lenght class_weights[dataset.class2max_length < chunk_length] = 0 - + # renormalize weights and sample class_weights /= class_weights.sum() - #logging.info(str(class_weights)) + # logging.info(str(class_weights)) 
class_idx = torch.multinomial( - class_weights, - num_samples=num_classes_per_batch, replacement=True) + class_weights, num_samples=num_classes_per_batch, replacement=True + ) - utt_idx = torch.zeros((len(class_idx)*self.num_egs_per_class,), dtype=torch.long) - k=0 + utt_idx = torch.zeros( + (len(class_idx) * self.num_egs_per_class,), dtype=torch.long + ) + k = 0 for c in class_idx.tolist(): - #for each class we sample an utt between the utt longer than chunk length - + # for each class we sample an utt between the utt longer than chunk length + # get utts for class c utt_idx_c = torch.as_tensor(dataset.class2utt_idx[c]) @@ -145,22 +161,22 @@ def _get_utt_idx_seq_st_max_length(self, chunk_length, batch_mult=1): utt_weights = torch.ones((int(dataset.class2num_utt[c]),)) utt_weights[seq_lengths_c < chunk_length] = 0 utt_weights /= utt_weights.sum() - - #sample utt idx + + # sample utt idx try: - utt_idx[k:k+self.num_egs_per_class] = utt_idx_c[ + utt_idx[k : k + self.num_egs_per_class] = utt_idx_c[ torch.multinomial( - utt_weights, - num_samples=self.num_egs_per_class, replacement=True)] + utt_weights, + num_samples=self.num_egs_per_class, + replacement=True, + ) + ] except: - logging.info('{} {}'.format(seq_lengths_c, utt_weights)) + logging.info("{} {}".format(seq_lengths_c, utt_weights)) k += self.num_egs_per_class - - return utt_idx - - + return utt_idx def __next__(self): @@ -170,7 +186,7 @@ def __next__(self): chunk_length = self.dataset.get_random_chunk_length() if self.var_batch_size: - batch_mult = int(self.dataset.max_chunk_length//chunk_length) + batch_mult = int(self.dataset.max_chunk_length // chunk_length) else: batch_mult = 1 @@ -178,72 +194,76 @@ def __next__(self): utt_idx = self._get_utt_idx_seq_st_max_length(chunk_length, batch_mult) else: utt_idx = self._get_utt_idx_basic(batch_mult) - + if self.num_egs_per_utt > 1: utt_idx = utt_idx.repeat(self.num_egs_per_utt) - utt_idx = utt_idx.tolist()[:self.batch_size * batch_mult] + utt_idx = utt_idx.tolist()[: self.batch_size * batch_mult] if self.batch == 0: - logging.info('batch 0 uttidx=%s', str(utt_idx[:10])) + logging.info("batch 0 uttidx=%s", str(utt_idx[:10])) self.batch += 1 index = [(i, chunk_length) for i in utt_idx] return index - @staticmethod def filter_args(**kwargs): - if 'no_shuffle_seqs' in kwargs: - kwargs['shuffle_seqs'] = not kwargs['no_shuffle_seqs'] - - valid_args = ('batch_size', 'var_batch_size', - 'iters_per_epoch', - 'num_egs_per_class', 'num_egs_per_utt') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - + if "no_shuffle_seqs" in kwargs: + kwargs["shuffle_seqs"] = not kwargs["no_shuffle_seqs"] + valid_args = ( + "batch_size", + "var_batch_size", + "iters_per_epoch", + "num_egs_per_class", + "num_egs_per_utt", + ) + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') - - parser.add_argument( - '--batch-size', - default=128, type=int, - help=('batch size')) + parser = ArgumentParser(prog="") + + parser.add_argument("--batch-size", default=128, type=int, help=("batch size")) parser.add_argument( - '--var-batch-size', default=False, - action='store_true', - help=('use variable batch-size, ' - 'then batch-size is the minimum batch size, ' - 'which is used when the batch chunk length is ' - 'equal to max-chunk-length')) + "--var-batch-size", + default=False, + action="store_true", + help=( + "use variable batch-size, " + "then batch-size is the 
minimum batch size, " + "which is used when the batch chunk length is " + "equal to max-chunk-length" + ), + ) parser.add_argument( - '--iters-per-epoch', - default='auto', - type=lambda x: x if x=='auto' else float(x), - help=('number of times we sample an utterance in each epoch')) + "--iters-per-epoch", + default="auto", + type=lambda x: x if x == "auto" else float(x), + help=("number of times we sample an utterance in each epoch"), + ) parser.add_argument( - '--num-egs-per-class', - type=int, default=1, - help=('number of samples per class in batch')) + "--num-egs-per-class", + type=int, + default=1, + help=("number of samples per class in batch"), + ) parser.add_argument( - '--num-egs-per-utt', - type=int, default=1, - help=('number of samples per utterance in batch')) + "--num-egs-per-utt", + type=int, + default=1, + help=("number of samples per utterance in batch"), + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='weighted seq sampler options') + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='weighted seq sampler options') add_argparse_args = add_class_args diff --git a/hyperion/torch/helpers/__init__.py b/hyperion/torch/helpers/__init__.py index 0bc7d412..1677eef0 100644 --- a/hyperion/torch/helpers/__init__.py +++ b/hyperion/torch/helpers/__init__.py @@ -4,5 +4,5 @@ """ # from .optimizer_factory import OptimizerFactory -#from .torch_na_loader import TorchNALoader -#from .torch_model_loader import TorchModelLoader +# from .torch_na_loader import TorchNALoader +# from .torch_model_loader import TorchModelLoader diff --git a/hyperion/torch/layer_blocks/conformer_conv.py b/hyperion/torch/layer_blocks/conformer_conv.py index 2b2de63a..7ed9a43a 100644 --- a/hyperion/torch/layer_blocks/conformer_conv.py +++ b/hyperion/torch/layer_blocks/conformer_conv.py @@ -9,26 +9,35 @@ from ..layers import ActivationFactory as AF from .se_blocks import SEBlock1d + def _conv1(in_channels, out_channels, bias=False): """1x1 convolution""" return nn.Conv1d(in_channels, out_channels, kernel_size=1, bias=bias) + def _dwconvk(channels, kernel_size, stride=1, bias=False): """kxk depth-wise convolution with padding""" - return nn.Conv1d(channels, channels, kernel_size=kernel_size, stride=stride, - padding=(kernel_size-1)//2, groups=channels, bias=bias, - padding_mode='zeros') + return nn.Conv1d( + channels, + channels, + kernel_size=kernel_size, + stride=stride, + padding=(kernel_size - 1) // 2, + groups=channels, + bias=bias, + padding_mode="zeros", + ) def _make_downsample(in_channels, out_channels, stride): - return _conv1(in_channels, out_channels, stride, bias=True) + return _conv1(in_channels, out_channels, stride, bias=True) class ConformerConvBlock(nn.Module): - """ Convolutional block for conformer introduced at + """Convolutional block for conformer introduced at https://arxiv.org/pdf/2005.08100.pdf - This includes some optional extra features + This includes some optional extra features not included in the original paper: - Squeeze-Excitation after depthwise-conv - Allows downsampling in time dimension @@ -39,16 +48,25 @@ class ConformerConvBlock(nn.Module): kernel_size: kernel_size for depth-wise conv stride: stride for depth-wise conv activation: activation function str or object - norm_layer: norm layer constructor, + norm_layer: norm layer constructor, if None it uses BatchNorm dropout_rate: dropout rate se_r: Squeeze-Excitation compression ratio, if None it doesn't use 
Squeeze-Excitation """ - def __init__(self, num_channels, kernel_size, stride=1, activation='swish', - norm_layer=None, dropout_rate=0, se_r=None): + + def __init__( + self, + num_channels, + kernel_size, + stride=1, + activation="swish", + norm_layer=None, + dropout_rate=0, + se_r=None, + ): super().__init__() - self.num_channels = num_channels, + self.num_channels = (num_channels,) self.kernel_size = kernel_size self.stride = stride self.dropout_rate = dropout_rate @@ -56,17 +74,15 @@ def __init__(self, num_channels, kernel_size, stride=1, activation='swish', self.se_r = se_r self.has_se = se_r is not None and se_r > 1 - if norm_layer is None: norm_layer = nn.BatchNorm1d self.layer_norm = nn.LayerNorm(num_channels) # expansion phase - self.conv_exp = _conv1(num_channels, 2*num_channels, bias=True) + self.conv_exp = _conv1(num_channels, 2 * num_channels, bias=True) - #depthwise conv phase - self.conv_dw = _dwconvk(num_channels, kernel_size, - stride=stride, bias=False) + # depthwise conv phase + self.conv_dw = _dwconvk(num_channels, kernel_size, stride=stride, bias=False) self.norm_dw = norm_layer(num_channels, momentum=0.01, eps=1e-3) if self.has_se: self.se_layer = SEBlock1d(num_channels, se_r, activation) @@ -82,12 +98,11 @@ def __init__(self, num_channels, kernel_size, stride=1, activation='swish', if stride != 1: self.downsample = _make_downsample(num_channels, num_channels, stride) - self.context = stride*(kernel_size-1)//2 - + self.context = stride * (kernel_size - 1) // 2 def forward(self, x): - """ Forward function - + """Forward function + Args: x: input size = (batch, num_channels, time) @@ -95,9 +110,9 @@ def forward(self, x): torch.Tensor size = (batch, num_channels, (time-1)//stride+1) """ residual = x - + # layer norm - x = self.layer_norm(x.transpose(1,2)).transpose(1,2) + x = self.layer_norm(x.transpose(1, 2)).transpose(1, 2) # expansion + glu x = self.conv_exp(x) @@ -108,7 +123,7 @@ def forward(self, x): if self.has_se: x = self.se_layer(x) - # final projection + # final projection x = self.conv_proj(x) if self.dropout_rate > 0: x = self.dropout(x) diff --git a/hyperion/torch/layer_blocks/conformer_encoder_v1.py b/hyperion/torch/layer_blocks/conformer_encoder_v1.py index cb23a4f7..a54e3b99 100644 --- a/hyperion/torch/layer_blocks/conformer_encoder_v1.py +++ b/hyperion/torch/layer_blocks/conformer_encoder_v1.py @@ -2,7 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# +# import torch import torch.nn as nn @@ -11,13 +11,14 @@ from .transformer_feedforward import * from .conformer_conv import ConformerConvBlock + class ConformerEncoderBlockV1(nn.Module): """Building block for conformer encoder introduced in https://arxiv.org/pdf/2005.08100.pdf - This includes some optional extra features + This includes some optional extra features not included in the original paper: - - Choose local-attention (attending only to close frames + - Choose local-attention (attending only to close frames instead of all the frames in the sequence) - Choose number of conv blocks - Squeeze-Excitation after depthwise-conv @@ -35,18 +36,18 @@ class ConformerEncoderBlockV1(nn.Module): feed_forward: position-wise feed-forward string in ['linear', 'conv1dx2', 'conv1d-linear'] d_ff: dimension of middle layer in feed_forward block ff_kernel_size: kernel size for convolutional versions of ff block - hid_act: ff and conv block hidden activation + hid_act: ff and conv block hidden activation dropout_rate: dropout rate for 
ff and conv blocks att_context: maximum context range for local attention att_dropout_rate: dropout rate for attention block causal_pos_enc: if True, use causal positional encodings (when rel_pos_enc=True), it assumes that query q_i only attents to key k_j when j<=i - conv_norm_layer: norm layer constructor for conv block, + conv_norm_layer: norm layer constructor for conv block, if None it uses BatchNorm se_r: Squeeze-Excitation compression ratio, if None it doesn't use Squeeze-Excitation ff_macaron: if True, it uses macaron-net style ff layers, otherwise transformer style. - out_lnorm: if True, use LNorm layer at the output as in the conformer paper, + out_lnorm: if True, use LNorm layer at the output as in the conformer paper, we think that this layer is redundant and put it to False by default concat_after: if True, if concats attention input and output and apply linear transform, i.e., y = x + linear(concat(x, att(x))) @@ -54,39 +55,65 @@ class ConformerEncoderBlockV1(nn.Module): """ - def __init__(self, num_feats, self_attn, num_heads, - conv_repeats=1, conv_kernel_size=31, conv_stride=1, - feed_forward='linear', d_ff=2048, ff_kernel_size=3, - hid_act='swish', dropout_rate=0, - att_context=25, att_dropout_rate=0, - pos_enc_type='rel', causal_pos_enc=False, - conv_norm_layer=None, se_r=None, - ff_macaron=True, out_lnorm=False, concat_after=False): + def __init__( + self, + num_feats, + self_attn, + num_heads, + conv_repeats=1, + conv_kernel_size=31, + conv_stride=1, + feed_forward="linear", + d_ff=2048, + ff_kernel_size=3, + hid_act="swish", + dropout_rate=0, + att_context=25, + att_dropout_rate=0, + pos_enc_type="rel", + causal_pos_enc=False, + conv_norm_layer=None, + se_r=None, + ff_macaron=True, + out_lnorm=False, + concat_after=False, + ): super().__init__() self.self_attn = self._make_att( - self_attn, num_feats, num_heads, att_context, att_dropout_rate, - pos_enc_type, causal_pos_enc) - + self_attn, + num_feats, + num_heads, + att_context, + att_dropout_rate, + pos_enc_type, + causal_pos_enc, + ) + self.ff_scale = 1 self.ff_macaron = ff_macaron if ff_macaron: self.ff_scale = 0.5 self.feed_forward_macaron = self._make_ff( - feed_forward, num_feats, d_ff, ff_kernel_size, - hid_act, dropout_rate) + feed_forward, num_feats, d_ff, ff_kernel_size, hid_act, dropout_rate + ) self.norm_ff_macaron = nn.LayerNorm(num_feats) self.feed_forward = self._make_ff( - feed_forward, num_feats, d_ff, ff_kernel_size, - hid_act, dropout_rate) + feed_forward, num_feats, d_ff, ff_kernel_size, hid_act, dropout_rate + ) conv_blocks = [] for i in range(conv_repeats): block_i = ConformerConvBlock( - num_feats, conv_kernel_size, conv_stride, - activation=hid_act, norm_layer=conv_norm_layer, - dropout_rate=dropout_rate, se_r=se_r) + num_feats, + conv_kernel_size, + conv_stride, + activation=hid_act, + norm_layer=conv_norm_layer, + dropout_rate=dropout_rate, + se_r=se_r, + ) conv_stride = 1 conv_blocks.append(block_i) @@ -105,10 +132,16 @@ def __init__(self, num_feats, self_attn, num_heads, if self.concat_after: self.concat_linear = nn.Linear(num_feats + num_feats, num_feats) - @staticmethod - def _make_att(att_type, num_feats, num_heads, context, - dropout_rate, pos_enc_type, causal_pos_enc): + def _make_att( + att_type, + num_feats, + num_heads, + context, + dropout_rate, + pos_enc_type, + causal_pos_enc, + ): """Creates multihead attention block from att_type string Args: @@ -123,31 +156,52 @@ def _make_att(att_type, num_feats, num_heads, context, Returns: Attention nn.Module """ - + assert num_feats % 
num_heads == 0 d_k = num_feats // num_heads - if att_type == 'scaled-dot-prod-v1': - if pos_enc_type == 'rel': + if att_type == "scaled-dot-prod-v1": + if pos_enc_type == "rel": return ScaledDotProdAttRelPosEncV1( - num_feats, num_feats, num_heads, d_k, d_k, - causal_pos_enc, dropout_rate, time_dim=1) + num_feats, + num_feats, + num_heads, + d_k, + d_k, + causal_pos_enc, + dropout_rate, + time_dim=1, + ) return ScaledDotProdAttV1( - num_feats, num_feats, num_heads, d_k, d_k, - dropout_rate, time_dim=1) + num_feats, num_feats, num_heads, d_k, d_k, dropout_rate, time_dim=1 + ) - if att_type == 'local-scaled-dot-prod-v1': - if pos_enc_type == 'rel': + if att_type == "local-scaled-dot-prod-v1": + if pos_enc_type == "rel": return LocalScaledDotProdAttRelPosEncV1( - num_feats, num_feats, num_heads, d_k, d_k, - context, causal_pos_enc, dropout_rate, time_dim=1) + num_feats, + num_feats, + num_heads, + d_k, + d_k, + context, + causal_pos_enc, + dropout_rate, + time_dim=1, + ) return LocalScaledDotProdAttV1( - num_feats, num_feats, num_heads, d_k, d_k, - context, dropout_rate, time_dim=1) + num_feats, + num_feats, + num_heads, + d_k, + d_k, + context, + dropout_rate, + time_dim=1, + ) - @staticmethod def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rate): """Creates position-wise feed forward block from ff_type string @@ -159,24 +213,25 @@ def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rat kernel_size: kernel size for convolutional versions of ff block dropout_rate: dropout rate for ff block activation: activation function for ff block - + Returns: - Position-wise feed-forward nn.Module + Position-wise feed-forward nn.Module """ - if ff_type == 'linear': + if ff_type == "linear": return PositionwiseFeedForward( - num_feats, hid_feats, activation, dropout_rate, time_dim=1) + num_feats, hid_feats, activation, dropout_rate, time_dim=1 + ) - if ff_type == 'conv1dx2': + if ff_type == "conv1dx2": return Conv1dx2( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1) + num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 + ) - if ff_type == 'conv1d-linear': + if ff_type == "conv1d-linear": return Conv1dLinear( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1) - - + num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 + ) def forward(self, x, pos_emb=None, mask=None): """Forward pass function @@ -220,11 +275,11 @@ def forward(self, x, pos_emb=None, mask=None): x = residual + x # convolutional blocks - x = x.transpose(1,2) + x = x.transpose(1, 2) for block in range(len(self.conv_blocks)): x = self.conv_blocks[block](x) - x = x.transpose(1,2) + x = x.transpose(1, 2) # feed-forward block residual = x @@ -238,5 +293,5 @@ def forward(self, x, pos_emb=None, mask=None): # output norm if self.out_lnorm: x = self.norm_out(x) - + return x, mask diff --git a/hyperion/torch/layer_blocks/dc1d_blocks.py b/hyperion/torch/layer_blocks/dc1d_blocks.py index 8b5f19e0..f5b794ef 100644 --- a/hyperion/torch/layer_blocks/dc1d_blocks.py +++ b/hyperion/torch/layer_blocks/dc1d_blocks.py @@ -12,19 +12,26 @@ class DC1dEncBlock(nn.Module): - - def __init__(self, in_channels, out_channels, - kernel_size, stride=1, dilation=1, - activation='relu', - dropout_rate=0, - use_norm=True, norm_layer=None, norm_before=True): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + activation="relu", + dropout_rate=0, + use_norm=True, + norm_layer=None, + 
norm_before=True, + ): super().__init__() self.activation = AF.create(activation) - padding = int(dilation * (kernel_size -1)/2) + padding = int(dilation * (kernel_size - 1) / 2) - self.dropout_rate =dropout_rate + self.dropout_rate = dropout_rate self.dropout = None if dropout_rate > 0: self.dropout = Dropout1d(dropout_rate) @@ -35,35 +42,33 @@ def __init__(self, in_channels, out_channels, if norm_layer is None: norm_layer = BatchNorm1d - self.bn1 = norm_layer(out_channels) + self.bn1 = norm_layer(out_channels) if norm_before: self.norm_before = True else: self.norm_after = True self.conv1 = Conv1d( - in_channels, out_channels, + in_channels, + out_channels, bias=(not self.norm_before), - kernel_size=kernel_size, + kernel_size=kernel_size, stride=stride, - dilation=dilation, - padding=padding) + dilation=dilation, + padding=padding, + ) self.stride = stride - self.context = dilation*(kernel_size-1)//2 - + self.context = dilation * (kernel_size - 1) // 2 def freeze(self): for param in self.parameters(): param.requires_grad = False - def unfreeze(self): for param in self.parameters(): param.requires_grad = True - - def forward(self, x): x = self.conv1(x) @@ -72,7 +77,7 @@ def forward(self, x): if self.activation is not None: x = self.activation(x) - + if self.norm_after: x = self.bn1(x) @@ -82,21 +87,27 @@ def forward(self, x): return x - class DC1dDecBlock(nn.Module): - - def __init__(self, in_channels, out_channels, - kernel_size, stride=1, dilation=1, - activation='relu', - dropout_rate=0, - use_norm=True, norm_layer=None, norm_before=True): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + activation="relu", + dropout_rate=0, + use_norm=True, + norm_layer=None, + norm_before=True, + ): super().__init__() self.activation = AF.create(activation) - padding = int(dilation * (kernel_size -1)/2) + padding = int(dilation * (kernel_size - 1) / 2) - self.dropout_rate =dropout_rate + self.dropout_rate = dropout_rate self.dropout = None if dropout_rate > 0: self.dropout = Dropout1d(dropout_rate) @@ -107,7 +118,7 @@ def __init__(self, in_channels, out_channels, if norm_layer is None: norm_layer = BatchNorm1d - self.bn1 = norm_layer(out_channels) + self.bn1 = norm_layer(out_channels) if norm_before: self.norm_before = True else: @@ -115,36 +126,36 @@ def __init__(self, in_channels, out_channels, if stride == 1: self.conv1 = Conv1d( - in_channels, out_channels, - kernel_size=kernel_size, - stride=1, - dilation=dilation, - bias=(not self.norm_before), - padding=padding) + in_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + dilation=dilation, + bias=(not self.norm_before), + padding=padding, + ) else: self.conv1 = SubPixelConv1d( - in_channels, out_channels, - kernel_size=kernel_size, + in_channels, + out_channels, + kernel_size=kernel_size, stride=stride, - dilation=dilation, + dilation=dilation, bias=(not self.norm_before), - padding=padding) + padding=padding, + ) self.stride = stride - self.context = dilation*(kernel_size-1)//2 - + self.context = dilation * (kernel_size - 1) // 2 def freeze(self): for param in self.parameters(): param.requires_grad = False - def unfreeze(self): for param in self.parameters(): param.requires_grad = True - - def forward(self, x): x = self.conv1(x) @@ -153,7 +164,7 @@ def forward(self, x): if self.activation is not None: x = self.activation(x) - + if self.norm_after: x = self.bn1(x) @@ -161,7 +172,3 @@ def forward(self, x): x = self.dropout(x) return x - - - - diff --git 
a/hyperion/torch/layer_blocks/dc2d_blocks.py b/hyperion/torch/layer_blocks/dc2d_blocks.py index e6012743..0d251528 100644 --- a/hyperion/torch/layer_blocks/dc2d_blocks.py +++ b/hyperion/torch/layer_blocks/dc2d_blocks.py @@ -11,19 +11,26 @@ class DC2dEncBlock(nn.Module): - - def __init__(self, in_channels, out_channels, - kernel_size, stride=1, dilation=1, - activation='relu', - dropout_rate=0, - use_norm=True, norm_layer=None, norm_before=True): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + activation="relu", + dropout_rate=0, + use_norm=True, + norm_layer=None, + norm_before=True, + ): super().__init__() self.activation = AF.create(activation) - padding = int(dilation * (kernel_size -1)/2) + padding = int(dilation * (kernel_size - 1) / 2) - self.dropout_rate =dropout_rate + self.dropout_rate = dropout_rate self.dropout = None if dropout_rate > 0: self.dropout = Dropout2d(dropout_rate) @@ -41,28 +48,26 @@ def __init__(self, in_channels, out_channels, self.norm_after = True self.conv1 = Conv2d( - in_channels, out_channels, + in_channels, + out_channels, bias=(not self.norm_before), - kernel_size=kernel_size, + kernel_size=kernel_size, stride=stride, - dilation=dilation, - padding=padding) + dilation=dilation, + padding=padding, + ) self.stride = stride - self.context = dilation*(kernel_size-1)//2 - + self.context = dilation * (kernel_size - 1) // 2 def freeze(self): for param in self.parameters(): param.requires_grad = False - def unfreeze(self): for param in self.parameters(): param.requires_grad = True - - def forward(self, x): x = self.conv1(x) @@ -71,7 +76,7 @@ def forward(self, x): if self.activation is not None: x = self.activation(x) - + if self.norm_after: x = self.bn1(x) @@ -81,21 +86,27 @@ def forward(self, x): return x - class DC2dDecBlock(nn.Module): - - def __init__(self, in_channels, out_channels, - kernel_size, stride=1, dilation=1, - activation='relu', - dropout_rate=0, - use_norm=True, norm_layer=None, norm_before=True): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + dilation=1, + activation="relu", + dropout_rate=0, + use_norm=True, + norm_layer=None, + norm_before=True, + ): super().__init__() self.activation = AF.create(activation) - padding = int(dilation * (kernel_size -1)/2) + padding = int(dilation * (kernel_size - 1) / 2) - self.dropout_rate =dropout_rate + self.dropout_rate = dropout_rate self.dropout = None if dropout_rate > 0: self.dropout = Dropout2d(dropout_rate) @@ -114,36 +125,36 @@ def __init__(self, in_channels, out_channels, if stride == 1: self.conv1 = Conv2d( - in_channels, out_channels, - kernel_size=kernel_size, - stride=1, - dilation=dilation, - bias=(not self.norm_before), - padding=padding) #pytorch > 1.0 + in_channels, + out_channels, + kernel_size=kernel_size, + stride=1, + dilation=dilation, + bias=(not self.norm_before), + padding=padding, + ) # pytorch > 1.0 else: self.conv1 = SubPixelConv2d( - in_channels, out_channels, - kernel_size=kernel_size, + in_channels, + out_channels, + kernel_size=kernel_size, stride=stride, - dilation=dilation, + dilation=dilation, bias=(not self.norm_before), - padding=padding) + padding=padding, + ) self.stride = stride - self.context = dilation*(kernel_size-1)//2 - + self.context = dilation * (kernel_size - 1) // 2 def freeze(self): for param in self.parameters(): param.requires_grad = False - def unfreeze(self): for param in self.parameters(): param.requires_grad = True - - def forward(self, x): x = self.conv1(x) @@ -152,7 
+163,7 @@ def forward(self, x): if self.activation is not None: x = self.activation(x) - + if self.norm_after: x = self.bn1(x) @@ -160,7 +171,3 @@ def forward(self, x): x = self.dropout(x) return x - - - - diff --git a/hyperion/torch/layer_blocks/etdnn_blocks.py b/hyperion/torch/layer_blocks/etdnn_blocks.py index 1b371657..958c31ba 100644 --- a/hyperion/torch/layer_blocks/etdnn_blocks.py +++ b/hyperion/torch/layer_blocks/etdnn_blocks.py @@ -13,19 +13,24 @@ class ETDNNBlock(nn.Module): - - def __init__(self, in_channels, out_channels, - kernel_size, dilation=1, - activation={'name':'relu', 'inplace': True}, - dropout_rate=0, - norm_layer=None, use_norm=True, norm_before=False): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + dilation=1, + activation={"name": "relu", "inplace": True}, + dropout_rate=0, + norm_layer=None, + use_norm=True, + norm_before=False, + ): super().__init__() - self.activation1 = AF.create(activation) self.activation2 = AF.create(activation) - padding = int(dilation * (kernel_size - 1)/2) + padding = int(dilation * (kernel_size - 1) / 2) self.dropout_rate = dropout_rate self.dropout = None @@ -47,11 +52,15 @@ def __init__(self, in_channels, out_channels, self.norm_after = True bias = not self.norm_before - self.conv1 = Conv1d(in_channels, out_channels, bias=bias, - kernel_size=kernel_size, dilation=dilation, - padding=padding) + self.conv1 = Conv1d( + in_channels, + out_channels, + bias=bias, + kernel_size=kernel_size, + dilation=dilation, + padding=padding, + ) self.conv2 = Conv1d(out_channels, out_channels, bias=bias, kernel_size=1) - def forward(self, x): @@ -61,7 +70,7 @@ def forward(self, x): x = self.bn1(x) x = self.activation1(x) - + if self.norm_after: x = self.bn1(x) @@ -74,7 +83,7 @@ def forward(self, x): x = self.bn2(x) x = self.activation2(x) - + if self.norm_after: x = self.bn2(x) @@ -82,4 +91,3 @@ def forward(self, x): x = self.dropout2(x) return x - diff --git a/hyperion/torch/layer_blocks/fc_blocks.py b/hyperion/torch/layer_blocks/fc_blocks.py index 5fe31275..567474bf 100644 --- a/hyperion/torch/layer_blocks/fc_blocks.py +++ b/hyperion/torch/layer_blocks/fc_blocks.py @@ -2,7 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# +# # import numpy as np @@ -11,9 +11,10 @@ from ..layers import ActivationFactory as AF + class FCBlock(nn.Module): - """ Fully connected block - + """Fully connected block + Attributes: in_feats: input feature dimension out_feats: output feature dimension @@ -23,11 +24,16 @@ class FCBlock(nn.Module): norm_before: if True normalization layer is applied before the activation function, if False after """ - def __init__(self, in_feats, out_feats, - activation={'name':'relu', 'inplace': True}, - dropout_rate=0, - norm_layer=None, - use_norm=True, norm_before=False): + def __init__( + self, + in_feats, + out_feats, + activation={"name": "relu", "inplace": True}, + dropout_rate=0, + norm_layer=None, + use_norm=True, + norm_before=False, + ): super().__init__() @@ -50,13 +56,10 @@ def __init__(self, in_feats, out_feats, else: self.norm_after = True - self.linear = Linear(in_feats, out_feats, bias=(not self.norm_before)) - - + self.linear = Linear(in_feats, out_feats, bias=(not self.norm_before)) def forward(self, x): - """ Forward function - """ + """Forward function""" x = self.linear(x) if self.norm_before: x = self.bn1(x) @@ -72,10 +75,9 @@ def forward(self, x): return x - def forward_linear(self, x): - """ Forward function - 
without activation function + """Forward function + without activation function """ x = self.linear(x) @@ -83,7 +85,3 @@ def forward_linear(self, x): x = self.bn1(x) return x - - - - diff --git a/hyperion/torch/layer_blocks/res2net_blocks.py b/hyperion/torch/layer_blocks/res2net_blocks.py index 697f5c79..56804307 100644 --- a/hyperion/torch/layer_blocks/res2net_blocks.py +++ b/hyperion/torch/layer_blocks/res2net_blocks.py @@ -13,8 +13,16 @@ def _conv3x3(in_channels, out_channels, stride=1, groups=1, dilation=1, bias=False): """3x3 convolution with padding""" - return nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, - padding=dilation, groups=groups, bias=bias, dilation=dilation) + return nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=bias, + dilation=dilation, + ) def _conv1x1(in_channels, out_channels, stride=1, bias=False): @@ -26,21 +34,33 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) if norm_before: return nn.Sequential( - _conv1x1(in_channels, out_channels, stride, bias=False), - norm_layer(out_channels)) - - return _conv1x1(in_channels, out_channels, stride, bias=True) - + _conv1x1(in_channels, out_channels, stride, bias=False), + norm_layer(out_channels), + ) + + return _conv1x1(in_channels, out_channels, stride, bias=True) + class Res2NetBasicBlock(nn.Module): expansion = 1 - def __init__(self, in_channels, channels, - activation={'name':'relu', 'inplace': True}, - stride=1, dropout_rate=0, - width_factor=1, scale=4, groups=1, - dilation=1, norm_layer=None, norm_before=True, - se_r=None, time_se=False, num_feats=None): + def __init__( + self, + in_channels, + channels, + activation={"name": "relu", "inplace": True}, + stride=1, + dropout_rate=0, + width_factor=1, + scale=4, + groups=1, + dilation=1, + norm_layer=None, + norm_before=True, + se_r=None, + time_se=False, + num_feats=None, + ): super().__init__() @@ -70,7 +90,9 @@ def __init__(self, in_channels, channels, proj1s = [] bn1s = [] for i in range(self.num_3x3): - conv1s.append(_conv3x3(width_in, width_mid, stride, groups, dilation, bias=bias)) + conv1s.append( + _conv3x3(width_in, width_mid, stride, groups, dilation, bias=bias) + ) bn1s.append(norm_layer(width_mid)) if self.has_proj1 and i < self.num_3x3 - 1: proj1s.append(_conv1x1(width_mid, width_in, bias=False)) @@ -90,8 +112,9 @@ def __init__(self, in_channels, channels, self.downsample = None if stride != 1 or in_channels != channels * self.expansion: - self.downsample = _make_downsample(in_channels, channels * self.expansion, - stride, norm_layer, norm_before) + self.downsample = _make_downsample( + in_channels, channels * self.expansion, stride, norm_layer, norm_before + ) self.dropout_rate = dropout_rate self.dropout = None @@ -103,20 +126,16 @@ def __init__(self, in_channels, channels, if se_r is not None: if time_se: - self.se_layer = TSEBlock2D( - channels, num_feats, se_r, activation) + self.se_layer = TSEBlock2D(channels, num_feats, se_r, activation) else: - self.se_layer = SEBlock2D( - channels, se_r, activation) + self.se_layer = SEBlock2D(channels, se_r, activation) else: self.se_layer = None - @property def out_channels(self): return self.channels - def forward(self, x): residual = x split_size = [self.width_in for i in range(self.scale - 1)] @@ -129,7 +148,7 @@ def forward(self, x): x_i = split_x[i] else: if self.has_proj1: - x_i = self.proj1s[i-1](x_i) + x_i = self.proj1s[i - 1](x_i) x_i = x_i + split_x[i] @@ -161,23 +180,33 @@ 
def forward(self, x): if not self.norm_before: x = self.bn2(x) - + if self.dropout_rate > 0: x = self.dropout(x) return x - class Res2NetBNBlock(nn.Module): expansion = 4 - def __init__(self, in_channels, channels, - activation={'name':'relu', 'inplace': True}, - stride=1, dropout_rate=0, - width_factor=1, scale=4, groups=1, - dilation=1, norm_layer=None, norm_before=True, - se_r=None, time_se=False, num_feats=None): + def __init__( + self, + in_channels, + channels, + activation={"name": "relu", "inplace": True}, + stride=1, + dropout_rate=0, + width_factor=1, + scale=4, + groups=1, + dilation=1, + norm_layer=None, + norm_before=True, + se_r=None, + time_se=False, + num_feats=None, + ): super().__init__() @@ -202,7 +231,7 @@ def __init__(self, in_channels, channels, self.num_3x3 = scale - 1 if stride > 1 and scale > 1: - self.pool = nn.AvgPool2d(kernel_size=3, stride = stride, padding=1) + self.pool = nn.AvgPool2d(kernel_size=3, stride=stride, padding=1) conv2s = [] bn2s = [] @@ -224,8 +253,9 @@ def __init__(self, in_channels, channels, self.downsample = None if stride != 1 or in_channels != channels * self.expansion: - self.downsample = _make_downsample(in_channels, channels * self.expansion, - stride, norm_layer, norm_before) + self.downsample = _make_downsample( + in_channels, channels * self.expansion, stride, norm_layer, norm_before + ) self.dropout_rate = dropout_rate self.dropout = None @@ -238,19 +268,17 @@ def __init__(self, in_channels, channels, if se_r is not None: if time_se: self.se_layer = TSEBlock2D( - channels * self.expansion, num_feats, se_r, activation) + channels * self.expansion, num_feats, se_r, activation + ) else: - self.se_layer = SEBlock2D( - channels * self.expansion, se_r, activation) + self.se_layer = SEBlock2D(channels * self.expansion, se_r, activation) else: self.se_layer = None - @property def out_channels(self): return self.channels * self.expansion - def forward(self, x): residual = x @@ -299,10 +327,8 @@ def forward(self, x): if not self.norm_before: x = self.bn3(x) - + if self.dropout_rate > 0: x = self.dropout(x) return x - - diff --git a/hyperion/torch/layer_blocks/resetdnn_blocks.py b/hyperion/torch/layer_blocks/resetdnn_blocks.py index 499e4e4c..9d849719 100644 --- a/hyperion/torch/layer_blocks/resetdnn_blocks.py +++ b/hyperion/torch/layer_blocks/resetdnn_blocks.py @@ -2,7 +2,7 @@ Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -# +# import numpy as np @@ -13,18 +13,31 @@ from ..layers import Dropout1d from .etdnn_blocks import ETDNNBlock -class ResETDNNBlock(ETDNNBlock): - - def __init__(self, num_channels, - kernel_size, dilation=1, - activation={'name':'relu', 'inplace': True}, - dropout_rate=0, - norm_layer=None, use_norm=True, norm_before=False): - - super().__init__(num_channels, num_channels, - kernel_size, dilation, activation, dropout_rate, - norm_layer, use_norm, norm_before) +class ResETDNNBlock(ETDNNBlock): + def __init__( + self, + num_channels, + kernel_size, + dilation=1, + activation={"name": "relu", "inplace": True}, + dropout_rate=0, + norm_layer=None, + use_norm=True, + norm_before=False, + ): + + super().__init__( + num_channels, + num_channels, + kernel_size, + dilation, + activation, + dropout_rate, + norm_layer, + use_norm, + norm_before, + ) def forward(self, x): @@ -35,7 +48,7 @@ def forward(self, x): x = self.bn1(x) x = self.activation1(x) - + if self.norm_after: x = self.bn1(x) @@ -49,7 +62,7 @@ def forward(self, x): x += residual x = 
self.activation2(x) - + if self.norm_after: x = self.bn2(x) @@ -57,4 +70,3 @@ def forward(self, x): x = self.dropout2(x) return x - diff --git a/hyperion/torch/layer_blocks/resnet_blocks.py b/hyperion/torch/layer_blocks/resnet_blocks.py index 291bd975..439a440a 100644 --- a/hyperion/torch/layer_blocks/resnet_blocks.py +++ b/hyperion/torch/layer_blocks/resnet_blocks.py @@ -9,10 +9,19 @@ from ..layers import ActivationFactory as AF + def _conv3x3(in_channels, out_channels, stride=1, groups=1, dilation=1, bias=False): """3x3 convolution with padding""" - return nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, - padding=dilation, groups=groups, bias=bias, dilation=dilation) + return nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=dilation, + groups=groups, + bias=bias, + dilation=dilation, + ) def _conv1x1(in_channels, out_channels, stride=1, bias=False): @@ -24,15 +33,15 @@ def _make_downsample(in_channels, out_channels, stride, norm_layer, norm_before) if norm_before: return nn.Sequential( - _conv1x1(in_channels, out_channels, stride, bias=False), - norm_layer(out_channels)) - - return _conv1x1(in_channels, out_channels, stride, bias=True) - + _conv1x1(in_channels, out_channels, stride, bias=False), + norm_layer(out_channels), + ) + + return _conv1x1(in_channels, out_channels, stride, bias=True) class ResNetInputBlock(nn.Module): - """ Input block for ResNet architecture + """Input block for ResNet architecture Args: in_channels: input channels @@ -41,44 +50,57 @@ class ResNetInputBlock(nn.Module): stride: stride for conv activation: str/dict indicating activation type and arguments norm_layer: norm_layer object constructor, if None it uses BatchNorm2d - norm_before: if True it applies the norm_layer before the activation, + norm_before: if True it applies the norm_layer before the activation, if False, after the activation do_maxpool: apply maxpooling 2x2 at the output """ - - def __init__(self, in_channels, out_channels, kernel_size=7, stride=2, - activation={'name':'relu', 'inplace': True}, - norm_layer=None, norm_before=True, do_maxpool=True): + + def __init__( + self, + in_channels, + out_channels, + kernel_size=7, + stride=2, + activation={"name": "relu", "inplace": True}, + norm_layer=None, + norm_before=True, + do_maxpool=True, + ): super().__init__() - padding = int((kernel_size - 1)/2) + padding = int((kernel_size - 1) / 2) if norm_layer is None: norm_layer = nn.BatchNorm2d bias = not norm_before - self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, - stride=stride, padding=padding, bias=bias) + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + bias=bias, + ) self.bn = norm_layer(out_channels) self.act = AF.create(activation) self.norm_before = norm_before self.do_maxpool = do_maxpool - self.context = int((kernel_size-1)/2) + self.context = int((kernel_size - 1) / 2) self.downsample_factor = stride if do_maxpool: self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) self.downsample_factor *= 2 - def forward(self, x): - + x = self.conv(x) if self.norm_before: x = self.bn(x) - + x = self.act(x) if not self.norm_before: x = self.bn(x) @@ -89,16 +111,23 @@ def forward(self, x): return x - class ResNetBasicBlock(nn.Module): expansion = 1 - #__constants__ = ['downsample'] - - def __init__(self, in_channels, channels, - activation={'name':'relu', 'inplace': True}, - stride=1, dropout_rate=0, groups=1, dilation=1, - norm_layer=None, 
norm_before=True): + # __constants__ = ['downsample'] + + def __init__( + self, + in_channels, + channels, + activation={"name": "relu", "inplace": True}, + stride=1, + dropout_rate=0, + groups=1, + dilation=1, + norm_layer=None, + norm_before=True, + ): super().__init__() if norm_layer is None: @@ -108,7 +137,9 @@ def __init__(self, in_channels, channels, self.channels = channels bias = not norm_before - self.conv1 = _conv3x3(in_channels, channels, stride, groups, dilation, bias=bias) + self.conv1 = _conv3x3( + in_channels, channels, stride, groups, dilation, bias=bias + ) self.bn1 = norm_layer(channels) self.act1 = AF.create(activation) self.conv2 = _conv3x3(channels, channels, groups=groups, bias=bias) @@ -119,8 +150,9 @@ def __init__(self, in_channels, channels, self.downsample = None if stride != 1 or in_channels != channels: - self.downsample = _make_downsample(in_channels, channels, - stride, norm_layer, norm_before) + self.downsample = _make_downsample( + in_channels, channels, stride, norm_layer, norm_before + ) self.dropout_rate = dropout_rate self.dropout = None @@ -130,12 +162,10 @@ def __init__(self, in_channels, channels, self.context = dilation + stride self.downsample_factor = stride - @property def out_channels(self): return self.channels - def forward(self, x): residual = x @@ -161,23 +191,29 @@ def forward(self, x): if not self.norm_before: x = self.bn2(x) - + if self.dropout_rate > 0: x = self.dropout(x) return x - - class ResNetBNBlock(nn.Module): expansion = 4 - #__constants__ = ['downsample'] - - def __init__(self, in_channels, channels, - activation={'name':'relu', 'inplace': True}, - stride=1, dropout_rate=0,groups=1, - dilation=1, norm_layer=None, norm_before=True): + # __constants__ = ['downsample'] + + def __init__( + self, + in_channels, + channels, + activation={"name": "relu", "inplace": True}, + stride=1, + dropout_rate=0, + groups=1, + dilation=1, + norm_layer=None, + norm_before=True, + ): super().__init__() @@ -204,8 +240,9 @@ def __init__(self, in_channels, channels, self.downsample = None if stride != 1 or in_channels != channels * self.expansion: - self.downsample = _make_downsample(in_channels, channels * self.expansion, - stride, norm_layer, norm_before) + self.downsample = _make_downsample( + in_channels, channels * self.expansion, stride, norm_layer, norm_before + ) self.dropout_rate = dropout_rate self.dropout = None @@ -215,12 +252,10 @@ def __init__(self, in_channels, channels, self.context = dilation self.downsample_factor = stride - @property def out_channels(self): return self.channels * self.expansion - def forward(self, x): residual = x @@ -250,7 +285,7 @@ def forward(self, x): if not self.norm_before: x = self.bn3(x) - + if self.dropout_rate > 0: x = self.dropout(x) @@ -258,7 +293,7 @@ def forward(self, x): class Interpolate(nn.Module): - def __init__(self, scale_factor, mode='nearest'): + def __init__(self, scale_factor, mode="nearest"): super().__init__() self.interp = nnf.interpolate self.scale_factor = scale_factor @@ -269,12 +304,16 @@ def forward(self, x): return x - class ResNetEndpointBlock(nn.Module): - - def __init__(self, in_channels, out_channels, scale, - activation={'name': 'relu', 'inplace': True}, - norm_layer=None, norm_before=True): + def __init__( + self, + in_channels, + out_channels, + scale, + activation={"name": "relu", "inplace": True}, + norm_layer=None, + norm_before=True, + ): super().__init__() @@ -293,8 +332,7 @@ def __init__(self, in_channels, out_channels, scale, self.scale = scale if self.scale > 1: - 
self.upsample = Interpolate(scale_factor=scale, mode='nearest') - + self.upsample = Interpolate(scale_factor=scale, mode="nearest") def forward(self, x): @@ -311,5 +349,3 @@ def forward(self, x): x = self.upsample(x) return x - - diff --git a/hyperion/torch/layer_blocks/se_blocks.py b/hyperion/torch/layer_blocks/se_blocks.py index 5f2567b8..3d33f7d4 100644 --- a/hyperion/torch/layer_blocks/se_blocks.py +++ b/hyperion/torch/layer_blocks/se_blocks.py @@ -11,36 +11,57 @@ class SEBlock2D(nn.Module): - """ From https://arxiv.org/abs/1709.01507 - """ - def __init__(self, num_channels, r=16, activation={'name':'relu', 'inplace': True}): + """From https://arxiv.org/abs/1709.01507""" + + def __init__( + self, num_channels, r=16, activation={"name": "relu", "inplace": True} + ): super().__init__() - self.conv1 = nn.Conv2d(num_channels, int(num_channels/r), kernel_size=1, bias=False) + self.conv1 = nn.Conv2d( + num_channels, int(num_channels / r), kernel_size=1, bias=False + ) self.act = AF.create(activation) - self.conv2 = nn.Conv2d(int(num_channels/r), num_channels, kernel_size=1, bias=False) + self.conv2 = nn.Conv2d( + int(num_channels / r), num_channels, kernel_size=1, bias=False + ) self.sigmoid = nn.Sigmoid() - def forward(self, x): - z = torch.mean(x, dim=(2,3), keepdim=True) + z = torch.mean(x, dim=(2, 3), keepdim=True) scale = self.sigmoid(self.conv2(self.act(self.conv1(z)))) y = scale * x return y class TSEBlock2D(nn.Module): - """ From https://arxiv.org/abs/1709.01507 - Modified to do pooling only in time dimension + """From https://arxiv.org/abs/1709.01507 + Modified to do pooling only in time dimension """ - def __init__(self, num_channels, num_feats, r=16, activation={'name':'relu', 'inplace': True}): + + def __init__( + self, + num_channels, + num_feats, + r=16, + activation={"name": "relu", "inplace": True}, + ): super().__init__() - self.num_channels_1d = num_channels*num_feats - self.conv1 = nn.Conv2d(self.num_channels_1d, int(self.num_channels_1d/r), kernel_size=1, bias=False) + self.num_channels_1d = num_channels * num_feats + self.conv1 = nn.Conv2d( + self.num_channels_1d, + int(self.num_channels_1d / r), + kernel_size=1, + bias=False, + ) self.act = AF.create(activation) - self.conv2 = nn.Conv2d(int(self.num_channels_1d/r), self.num_channels_1d, kernel_size=1, bias=False) + self.conv2 = nn.Conv2d( + int(self.num_channels_1d / r), + self.num_channels_1d, + kernel_size=1, + bias=False, + ) self.sigmoid = nn.Sigmoid() - def forward(self, x): num_feats = x.shape[2] num_channels = x.shape[1] @@ -53,17 +74,23 @@ def forward(self, x): class SEBlock1d(nn.Module): - """ 1d Squeeze Excitation version of - https://arxiv.org/abs/1709.01507 + """1d Squeeze Excitation version of + https://arxiv.org/abs/1709.01507 """ - def __init__(self, num_channels, r=16, activation={'name':'relu', 'inplace': True}): + + def __init__( + self, num_channels, r=16, activation={"name": "relu", "inplace": True} + ): super().__init__() - self.conv1 = nn.Conv1d(num_channels, int(num_channels/r), kernel_size=1, bias=False) + self.conv1 = nn.Conv1d( + num_channels, int(num_channels / r), kernel_size=1, bias=False + ) self.act = AF.create(activation) - self.conv2 = nn.Conv1d(int(num_channels/r), num_channels, kernel_size=1, bias=False) + self.conv2 = nn.Conv1d( + int(num_channels / r), num_channels, kernel_size=1, bias=False + ) self.sigmoid = nn.Sigmoid() - def forward(self, x): z = torch.mean(x, dim=2, keepdim=True) scale = self.sigmoid(self.conv2(self.act(self.conv1(z)))) diff --git 
a/hyperion/torch/layer_blocks/seresnet_blocks.py b/hyperion/torch/layer_blocks/seresnet_blocks.py index 6ab1f992..a5a7fecd 100644 --- a/hyperion/torch/layer_blocks/seresnet_blocks.py +++ b/hyperion/torch/layer_blocks/seresnet_blocks.py @@ -13,25 +13,39 @@ class SEResNetBasicBlock(ResNetBasicBlock): - - def __init__(self, in_channels, channels, - activation={'name':'relu', 'inplace': True}, - stride=1, dropout_rate=0, groups=1, dilation=1, - norm_layer=None, norm_before=True, - se_r=16, time_se=False, num_feats=None): + def __init__( + self, + in_channels, + channels, + activation={"name": "relu", "inplace": True}, + stride=1, + dropout_rate=0, + groups=1, + dilation=1, + norm_layer=None, + norm_before=True, + se_r=16, + time_se=False, + num_feats=None, + ): super().__init__( - in_channels, channels, activation=activation, - stride=stride, dropout_rate=dropout_rate, - groups=groups, dilation=dilation, - norm_layer=norm_layer, norm_before=norm_before) + in_channels, + channels, + activation=activation, + stride=stride, + dropout_rate=dropout_rate, + groups=groups, + dilation=dilation, + norm_layer=norm_layer, + norm_before=norm_before, + ) if time_se: self.se_layer = TSEBlock2D(channels, num_feats, se_r, activation) else: self.se_layer = SEBlock2D(channels, se_r, activation) - def forward(self, x): residual = x @@ -58,35 +72,48 @@ def forward(self, x): if not self.norm_before: x = self.bn2(x) - + if self.dropout_rate > 0: x = self.dropout(x) return x - - class SEResNetBNBlock(ResNetBNBlock): - - def __init__(self, in_channels, channels, - activation={'name':'relu', 'inplace': True}, - stride=1, dropout_rate=0, groups=1, - dilation=1, norm_layer=None, norm_before=True, - se_r=16, time_se=False, num_feats=None): + def __init__( + self, + in_channels, + channels, + activation={"name": "relu", "inplace": True}, + stride=1, + dropout_rate=0, + groups=1, + dilation=1, + norm_layer=None, + norm_before=True, + se_r=16, + time_se=False, + num_feats=None, + ): super().__init__( - in_channels, channels, activation=activation, - stride=stride, dropout_rate=dropout_rate,groups=groups, - dilation=dilation, norm_layer=norm_layer, norm_before=norm_before) + in_channels, + channels, + activation=activation, + stride=stride, + dropout_rate=dropout_rate, + groups=groups, + dilation=dilation, + norm_layer=norm_layer, + norm_before=norm_before, + ) if time_se: self.se_layer = TSEBlock2D( - channels * self.expansion, num_feats, se_r, activation) + channels * self.expansion, num_feats, se_r, activation + ) else: - self.se_layer = SEBlock2D( - channels * self.expansion, se_r, activation) - + self.se_layer = SEBlock2D(channels * self.expansion, se_r, activation) def forward(self, x): residual = x @@ -118,10 +145,8 @@ def forward(self, x): if not self.norm_before: x = self.bn3(x) - + if self.dropout_rate > 0: x = self.dropout(x) return x - - diff --git a/hyperion/torch/layer_blocks/tdnn_blocks.py b/hyperion/torch/layer_blocks/tdnn_blocks.py index 53af1b05..8fcbb056 100644 --- a/hyperion/torch/layer_blocks/tdnn_blocks.py +++ b/hyperion/torch/layer_blocks/tdnn_blocks.py @@ -9,18 +9,25 @@ from ..layers import ActivationFactory as AF from ..layers import Dropout1d -class TDNNBlock(nn.Module): - def __init__(self, in_channels, out_channels, - kernel_size, dilation=1, - activation={'name':'relu', 'inplace': True}, - dropout_rate=0, - norm_layer=None, use_norm=True, norm_before=False): +class TDNNBlock(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + dilation=1, + activation={"name": 
"relu", "inplace": True}, + dropout_rate=0, + norm_layer=None, + use_norm=True, + norm_before=False, + ): super().__init__() self.activation = AF.create(activation) - padding = int(dilation * (kernel_size -1)/2) + padding = int(dilation * (kernel_size - 1) / 2) self.dropout_rate = dropout_rate self.dropout = None @@ -33,29 +40,29 @@ def __init__(self, in_channels, out_channels, if norm_layer is None: norm_layer = BatchNorm1d - self.bn1 = norm_layer(out_channels) + self.bn1 = norm_layer(out_channels) if norm_before: self.norm_before = True else: self.norm_after = True - self.conv1 = Conv1d(in_channels, out_channels, - bias=(not self.norm_before), - kernel_size=kernel_size, dilation=dilation, - padding=padding) - + self.conv1 = Conv1d( + in_channels, + out_channels, + bias=(not self.norm_before), + kernel_size=kernel_size, + dilation=dilation, + padding=padding, + ) def freeze(self): for param in self.parameters(): param.requires_grad = False - def unfreeze(self): for param in self.parameters(): param.requires_grad = True - - def forward(self, x): x = self.conv1(x) @@ -65,7 +72,7 @@ def forward(self, x): if self.activation is not None: x = self.activation(x) - + if self.norm_after: x = self.bn1(x) @@ -73,7 +80,3 @@ def forward(self, x): x = self.dropout(x) return x - - - - diff --git a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py b/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py index 3882cd0f..c841a056 100644 --- a/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py +++ b/hyperion/torch/layer_blocks/transformer_conv2d_subsampler.py @@ -6,6 +6,7 @@ import torch import torch.nn as nn + class TransformerConv2dSubsampler(nn.Module): """Convolutional 2D subsampling (to 1/4 length) Tor transformer @@ -21,15 +22,14 @@ def __init__(self, in_feats, out_feats, hid_act, pos_enc, time_dim=1): super().__init__() self.time_dim = time_dim self.conv = nn.Sequential( - nn.Conv2d(1, out_feats, 3, 2, padding=(0,1)), + nn.Conv2d(1, out_feats, 3, 2, padding=(0, 1)), + hid_act, + nn.Conv2d(out_feats, out_feats, 3, 2, padding=(0, 1)), hid_act, - nn.Conv2d(out_feats, out_feats, 3, 2, padding=(0,1)), - hid_act ) self.out = nn.Sequential( - nn.Linear(out_feats * (((in_feats - 1) // 2 - 1) // 2), out_feats), - pos_enc) - + nn.Linear(out_feats * (((in_feats - 1) // 2 - 1) // 2), out_feats), pos_enc + ) def forward(self, x, mask): """Forward function. @@ -43,12 +43,12 @@ def forward(self, x, mask): Tensor with subsampled mask """ if self.time_dim == 1: - x = x.transpose(1,2) + x = x.transpose(1, 2) x = x.unsqueeze(1) # (b, c, f, t) x = self.conv(x) b, c, f, t = x.size() - x = self.out(x.contiguous().view(b, c * f, t).transpose(1,2)) + x = self.out(x.contiguous().view(b, c * f, t).transpose(1, 2)) if mask is None: return x, None return x, mask[:, :, :-2:2][:, :, :-2:2] diff --git a/hyperion/torch/layer_blocks/transformer_encoder_v1.py b/hyperion/torch/layer_blocks/transformer_encoder_v1.py index b294db8a..c8eaaa1b 100644 --- a/hyperion/torch/layer_blocks/transformer_encoder_v1.py +++ b/hyperion/torch/layer_blocks/transformer_encoder_v1.py @@ -9,6 +9,7 @@ from ..layers.attention import * from .transformer_feedforward import * + class TransformerEncoderBlockV1(nn.Module): """Building block for transformer encoder. 
@@ -19,7 +20,7 @@ class TransformerEncoderBlockV1(nn.Module): feed_forward: position-wise feed-forward nn.Module or string in ['linear', 'conv1dx2', 'conv1d-linear'] d_ff: dimension of middle layer in feed_forward block ff_kernel_size: kernel size for convolutional versions of ff block - ff_act: ff block hidden activation + ff_act: ff block hidden activation ff_dropout_rate: dropout rate for ff block att_context: maximum context range for local attention att_dropout_rate: dropout rate for attention block @@ -33,25 +34,42 @@ class TransformerEncoderBlockV1(nn.Module): """ - def __init__(self, num_feats, self_attn, num_heads, - feed_forward, d_ff, ff_kernel_size, - ff_act='relu6', ff_dropout_rate=0, - att_context=25, att_dropout_rate=0, - rel_pos_enc=False, causal_pos_enc=False, - norm_before=True, concat_after=False): + def __init__( + self, + num_feats, + self_attn, + num_heads, + feed_forward, + d_ff, + ff_kernel_size, + ff_act="relu6", + ff_dropout_rate=0, + att_context=25, + att_dropout_rate=0, + rel_pos_enc=False, + causal_pos_enc=False, + norm_before=True, + concat_after=False, + ): super().__init__() if isinstance(self_attn, str): self.self_attn = self._make_att( - self_attn, num_feats, num_heads, att_context, att_dropout_rate, - rel_pos_enc, causal_pos_enc) + self_attn, + num_feats, + num_heads, + att_context, + att_dropout_rate, + rel_pos_enc, + causal_pos_enc, + ) else: self.self_attn = self_attn if isinstance(feed_forward, str): self.feed_forward = self._make_ff( - feed_forward, num_feats, d_ff, ff_kernel_size, - ff_act, ff_dropout_rate) + feed_forward, num_feats, d_ff, ff_kernel_size, ff_act, ff_dropout_rate + ) else: self.feed_forward = feed_forward @@ -66,10 +84,16 @@ def __init__(self, num_feats, self_attn, num_heads, if self.concat_after: self.concat_linear = nn.Linear(num_feats + num_feats, num_feats) - @staticmethod - def _make_att(att_type, num_feats, num_heads, context, - dropout_rate, rel_pos_enc, causal_pos_enc): + def _make_att( + att_type, + num_feats, + num_heads, + context, + dropout_rate, + rel_pos_enc, + causal_pos_enc, + ): """Creates multihead attention block from att_type string Args: @@ -84,31 +108,52 @@ def _make_att(att_type, num_feats, num_heads, context, Returns: Attention nn.Module """ - + assert num_feats % num_heads == 0 d_k = num_feats // num_heads - if att_type == 'scaled-dot-prod-v1': + if att_type == "scaled-dot-prod-v1": if rel_pos_enc: return ScaledDotProdAttRelPosEncV1( - num_feats, num_feats, num_heads, d_k, d_k, - causal_pos_enc, dropout_rate, time_dim=1) + num_feats, + num_feats, + num_heads, + d_k, + d_k, + causal_pos_enc, + dropout_rate, + time_dim=1, + ) return ScaledDotProdAttV1( - num_feats, num_feats, num_heads, d_k, d_k, - dropout_rate, time_dim=1) + num_feats, num_feats, num_heads, d_k, d_k, dropout_rate, time_dim=1 + ) - if att_type == 'local-scaled-dot-prod-v1': + if att_type == "local-scaled-dot-prod-v1": if rel_pos_enc: return LocalScaledDotProdAttRelPosEncV1( - num_feats, num_feats, num_heads, d_k, d_k, - context, causal_pos_enc, dropout_rate, time_dim=1) + num_feats, + num_feats, + num_heads, + d_k, + d_k, + context, + causal_pos_enc, + dropout_rate, + time_dim=1, + ) return LocalScaledDotProdAttV1( - num_feats, num_feats, num_heads, d_k, d_k, - context, dropout_rate, time_dim=1) + num_feats, + num_feats, + num_heads, + d_k, + d_k, + context, + dropout_rate, + time_dim=1, + ) - @staticmethod def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rate): """Creates position-wise feed forward block from 
ff_type string @@ -120,24 +165,25 @@ def _make_ff(ff_type, num_feats, hid_feats, kernel_size, activation, dropout_rat kernel_size: kernel size for convolutional versions of ff block dropout_rate: dropout rate for ff block activation: activation function for ff block - + Returns: - Position-wise feed-forward nn.Module + Position-wise feed-forward nn.Module """ - if ff_type == 'linear': + if ff_type == "linear": return PositionwiseFeedForward( - num_feats, hid_feats, activation, dropout_rate, time_dim=1) + num_feats, hid_feats, activation, dropout_rate, time_dim=1 + ) - if ff_type == 'conv1dx2': + if ff_type == "conv1dx2": return Conv1dx2( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1) + num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 + ) - if ff_type == 'conv1d-linear': + if ff_type == "conv1d-linear": return Conv1dLinear( - num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1) - - + num_feats, hid_feats, kernel_size, activation, dropout_rate, time_dim=1 + ) def forward(self, x, pos_emb=None, mask=None): """Forward pass function diff --git a/hyperion/torch/layer_blocks/transformer_feedforward.py b/hyperion/torch/layer_blocks/transformer_feedforward.py index 5064adbf..900500ff 100644 --- a/hyperion/torch/layer_blocks/transformer_feedforward.py +++ b/hyperion/torch/layer_blocks/transformer_feedforward.py @@ -21,7 +21,9 @@ class PositionwiseFeedForward(nn.Module): time_dim: time dimension in the input tensor """ - def __init__(self, num_feats, hid_feats, activation='relu6', dropout_rate=0, time_dim=1): + def __init__( + self, num_feats, hid_feats, activation="relu6", dropout_rate=0, time_dim=1 + ): super().__init__() self.w_1 = nn.Linear(num_feats, hid_feats) self.w_2 = nn.Linear(hid_feats, num_feats) @@ -31,7 +33,6 @@ def __init__(self, num_feats, hid_feats, activation='relu6', dropout_rate=0, tim if self.dropout_rate > 0: self.dropout = torch.nn.Dropout(dropout_rate) - def forward(self, x): """Forward function. @@ -55,10 +56,9 @@ def forward(self, x): return x - class Conv1dx2(nn.Module): """Two layer Conv1d for transformer feed-forward block - + Introduced in `FastSpeech: Fast, Robust and Controllable Text to Speech`_. .. _`FastSpeech: Fast, Robust and Controllable Text to Speech`: https://arxiv.org/pdf/1905.09263.pdf @@ -72,20 +72,31 @@ class Conv1dx2(nn.Module): time_dim: indicates what is the time dimension in the input tensor. """ - def __init__(self, num_channels, hid_channels, kernel_size, dropout_rate=0, time_dim=-1): + def __init__( + self, num_channels, hid_channels, kernel_size, dropout_rate=0, time_dim=-1 + ): super().__init__() - self.w_1 = nn.Conv1d(num_channels, hid_channels, kernel_size, - stride=1, padding=(kernel_size - 1) // 2) - self.w_2 = nn.Conv1d(hid_channels, num_channels, kernel_size, - stride=1, padding=(kernel_size - 1) // 2) + self.w_1 = nn.Conv1d( + num_channels, + hid_channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + ) + self.w_2 = nn.Conv1d( + hid_channels, + num_channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + ) self.dropout_rate = dropout_rate self.time_dim = time_dim self.activation = AF.create(activation) if self.dropout_rate > 0: self.dropout = Dropout1d(dropout_rate) - def forward(self, x): """Calculates forward propagation. Args: @@ -118,12 +129,20 @@ class Conv1dLinear(nn.Module): activation: activation function for hidden layers dropout_rate: dropout rate time_dim: indicates what is the time dimension in the input tensor. 
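A minimal sketch, not taken from the repo (sizes are made up), of why the conv1d-linear variant above can use nn.Conv1d(hid_channels, num_channels, 1) in place of a second nn.Linear: a Conv1d with kernel_size=1 applies the same affine map to every frame, so with shared weights it matches a per-frame Linear exactly, while the first convolution is what contributes temporal context through its kernel_size.

import torch
import torch.nn as nn

# Illustrative sizes only; none of these values come from the repo.
batch, time, num_channels, hid_channels = 2, 50, 64, 256

x = torch.randn(batch, hid_channels, time)  # (batch, channels, time), the layout inside these blocks

conv1x1 = nn.Conv1d(hid_channels, num_channels, kernel_size=1)
linear = nn.Linear(hid_channels, num_channels)

# A 1x1 conv kernel of shape (out, in, 1) is just a linear weight with an extra axis,
# so copying it across makes the two modules compute the same per-frame affine map.
with torch.no_grad():
    linear.weight.copy_(conv1x1.weight.squeeze(-1))
    linear.bias.copy_(conv1x1.bias)

y_conv = conv1x1(x)                                # (batch, num_channels, time)
y_lin = linear(x.transpose(1, 2)).transpose(1, 2)  # same map applied frame by frame

print(torch.allclose(y_conv, y_lin, atol=1e-5))    # True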
- + """ - def __init__(self, num_channels, hid_channels, kernel_size, dropout_rate=0, time_dim=-1): + + def __init__( + self, num_channels, hid_channels, kernel_size, dropout_rate=0, time_dim=-1 + ): super().__init__() - self.w_1 = nn.Conv1d(num_channels, hid_channels, kernel_size, - stride=1, padding=(kernel_size - 1) // 2) + self.w_1 = nn.Conv1d( + num_channels, + hid_channels, + kernel_size, + stride=1, + padding=(kernel_size - 1) // 2, + ) self.w_2 = nn.Conv1d(hid_channels, num_channels, 1) self.dropout_rate = dropout_rate @@ -132,7 +151,6 @@ def __init__(self, num_channels, hid_channels, kernel_size, dropout_rate=0, time if self.dropout_rate > 0: self.dropout = Dropout1d(dropout_rate) - def forward(self, x): """Calculates forward propagation. Args: @@ -153,5 +171,3 @@ def forward(self, x): x.transpose(-1, self.time_dim) return x - - diff --git a/hyperion/torch/layers/attention.py b/hyperion/torch/layers/attention.py index 7efc6ecd..7b4f5c06 100644 --- a/hyperion/torch/layers/attention.py +++ b/hyperion/torch/layers/attention.py @@ -5,10 +5,11 @@ import math -#import numpy +# import numpy import torch from torch import nn + class ScaledDotProdAttV1(nn.Module): """Scaled dot product multihead attention layer @@ -19,11 +20,13 @@ class ScaledDotProdAttV1(nn.Module): d_k: key/query projection dimension d_v: value projection dimension dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input + time_dim: time dimension in the input, default=1 meaning input dimensions are (batch, time, in_feats) """ - def __init__(self, in_feats, out_feats, num_heads, d_k, d_v, dropout_rate=0, time_dim=1): + def __init__( + self, in_feats, out_feats, num_heads, d_k, d_v, dropout_rate=0, time_dim=1 + ): super().__init__() # We assume d_v always equals d_k self.d_v = d_v @@ -31,15 +34,14 @@ def __init__(self, in_feats, out_feats, num_heads, d_k, d_v, dropout_rate=0, tim self.num_heads = num_heads self.dropout_rate = dropout_rate self.time_dim = time_dim - self.linear_q = nn.Linear(in_feats, num_heads*d_k) - self.linear_k = nn.Linear(in_feats, num_heads*d_k) - self.linear_v = nn.Linear(in_feats, num_heads*d_v) - self.linear_out = nn.Linear(num_heads*d_v, out_feats) + self.linear_q = nn.Linear(in_feats, num_heads * d_k) + self.linear_k = nn.Linear(in_feats, num_heads * d_k) + self.linear_v = nn.Linear(in_feats, num_heads * d_v) + self.linear_out = nn.Linear(num_heads * d_v, out_feats) self.attn = None if self.dropout_rate > 0: self.dropout = nn.Dropout(p=dropout_rate) - @property def in_feats(self): return self.linear_v.in_features @@ -52,12 +54,18 @@ def __repr__(self): return self.__str__() def __str__(self): - s = '{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, dropout_rate={}, time_dim={})'.format( - self.__class__.__name__, self.in_feats, self.out_feats, self.num_heads, - self.d_k, self.d_v, self.dropout_rate, self.time_dim) + s = "{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, dropout_rate={}, time_dim={})".format( + self.__class__.__name__, + self.in_feats, + self.out_feats, + self.num_heads, + self.d_k, + self.d_v, + self.dropout_rate, + self.time_dim, + ) return s - def _compute_qkv(self, query, key, value): batch_size = value.size(0) if self.time_dim != 1: @@ -74,10 +82,11 @@ def _compute_qkv(self, query, key, value): return q, k, v - def _compute_softmax(self, scores, mask): if mask is not None: - mask = mask.unsqueeze(1).eq(0) # (batch, 1, time1, time2) or (batch, 1, time) + mask = mask.unsqueeze(1).eq( + 0 + ) # (batch, 1, time1, time2) or 
(batch, 1, time) if scores.dtype == torch.half: min_value = -65504 else: @@ -85,17 +94,18 @@ if mask.dim() == 4: scores = scores.masked_fill(mask, min_value) - return torch.softmax(scores, dim=-1).masked_fill(mask, 0.0) # (batch, head, time1, time2) + return torch.softmax(scores, dim=-1).masked_fill( + mask, 0.0 + ) # (batch, head, time1, time2) else: mask1 = mask.unsqueeze(2) mask2 = mask.unsqueeze(-1) scores = scores.masked_fill(mask1, min_value) scores = scores.masked_fill(mask2, min_value) - return torch.softmax(scores, dim=-1) # (batch, head, time1, time2) + return torch.softmax(scores, dim=-1) # (batch, head, time1, time2) return torch.softmax(scores, dim=-1) # (batch, head, time1, time2) - def _apply_attn(self, v): batch_size = v.size(0) if self.dropout_rate > 0: @@ -104,9 +114,12 @@ def _apply_attn(self, v): p_attn = self.attn x = torch.matmul(p_attn, v) # (batch, head, time1, d_k) - x = x.transpose(1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) # (batch, time1, d_model) + x = ( + x.transpose(1, 2) + .contiguous() + .view(batch_size, -1, self.num_heads * self.d_v) + ) # (batch, time1, d_model) return self.linear_out(x) # (batch, time1, d_model) - ___compute_softmax = _compute_softmax ___apply_attn = _apply_attn @@ -115,27 +128,27 @@ def forward(self, query, key, value, mask=None): """Computes 'Scaled Dot Product Attention'. Args: - query: query with size=(batch, time1, in_feats), + query: query with size=(batch, time1, in_feats), where time1 is the output time dimension key: key with size=(batch, time2, in_feats) where time1 is the input time dimension value: value with size=(batch, time2, in_feats) - mask: optional mask with size=(batch, time1, time2), + mask: optional mask with size=(batch, time1, time2), to zero attention between some time steps or size=(batch, time) to make time1=time2 Returns: Attention weighted average of the value with size=(batch, time1, out_feats) """ q, k, v = self._compute_qkv(query, key, value) - scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) # (batch, head, time1, time2) + scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt( + self.d_k + ) # (batch, head, time1, time2) self.attn = self.___compute_softmax(scores, mask) return self.___apply_attn(v) - - class LocalScaledDotProdAttV1(ScaledDotProdAttV1): - """Local Scaled dot product multihead attention layer + """Local Scaled dot product multihead attention layer It calculates self-attention between time steps within a window of 'context' frames. @@ -147,32 +160,47 @@ class LocalScaledDotProdAttV1(ScaledDotProdAttV1): d_k: key/query projection dimension d_v: value projection dimension context: maximum attention temporal context. 
dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input + time_dim: time dimension in the input, default=1 meaning input dimensions are (batch, time, in_feats) """ - def __init__(self, in_feats, out_feats, num_heads, d_k, d_v, - context=25, dropout_rate=0, time_dim=1): + def __init__( + self, + in_feats, + out_feats, + num_heads, + d_k, + d_v, + context=25, + dropout_rate=0, + time_dim=1, + ): """Construct an MultiHeadedAttention object.""" super().__init__( - in_feats, out_feats, num_heads, d_k, d_v, - dropout_rate, time_dim) + in_feats, out_feats, num_heads, d_k, d_v, dropout_rate, time_dim + ) self.context = context - def __repr__(self): return self.__str__() - def __str__(self): - s = ('{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, ' - 'context={}, dropout_rate={}, time_dim={})'.format( - self.__class__.__name__, self.in_feats, self.out_feats, self.num_heads, - self.d_k, self.d_v, self.context, self.dropout_rate, self.time_dim)) + s = ( + "{}(in_feats={}, out_feats={}, num_heads={}, d_k={}, d_v={}, " + "context={}, dropout_rate={}, time_dim={})".format( + self.__class__.__name__, + self.in_feats, + self.out_feats, + self.num_heads, + self.d_k, + self.d_v, + self.context, + self.dropout_rate, + self.time_dim, + ) + ) return s - - def _compute_qkv00(self, query, key, value): batch_size = query.size(0) t1 = query.size(self.time_dim) @@ -183,12 +211,14 @@ def _compute_qkv00(self, query, key, value): value = value.transpose(1, self.time_dim) context_k = self.context - num_blocks = math.ceil(t2 / context_k) #(t2 + context_k//2)//context_k + num_blocks = math.ceil(t2 / context_k) # (t2 + context_k//2)//context_k context_q = math.ceil(t1 / num_blocks) - num_blocks_q = math.ceil(t1 / context_q) #(t1 + context_q//2)//context_q - assert num_blocks == num_blocks_q, ( - 'num_blocks_k({})!=num_blocks_q({}), context_k={}, context_q={}, t1={}, t2={}'.format( - num_blocks, num_blocks_q, context_k, context_q, t1, t2)) + num_blocks_q = math.ceil(t1 / context_q) # (t1 + context_q//2)//context_q + assert ( + num_blocks == num_blocks_q + ), "num_blocks_k({})!=num_blocks_q({}), context_k={}, context_q={}, t1={}, t2={}".format( + num_blocks, num_blocks_q, context_k, context_q, t1, t2 + ) pad1 = context_q * num_blocks - t1 pad2 = context_k * num_blocks - t2 # print('1',query.shape,key.shape,value.shape,pad1,pad2, context_q, context_k) @@ -200,14 +230,12 @@ def _compute_qkv00(self, query, key, value): value = nn.functional.pad(value, (0, 0, 0, pad2)) # print('2',query.shape,key.shape,value.shape) - q0 = self.linear_q(query) # (batch, time1, head*d_k) - k0 = self.linear_k(key) # (batch, time2, head*d_k) - v0 = self.linear_v(value) # (batch, time2, head*d_v) + q0 = self.linear_q(query) # (batch, time1, head*d_k) + k0 = self.linear_k(key) # (batch, time2, head*d_k) + v0 = self.linear_v(value) # (batch, time2, head*d_v) return q0, k0, v0, context_q, context_k, num_blocks - - def _compute_qkv0(self, query, key, value): batch_size = query.size(0) t1 = query.size(self.time_dim) @@ -218,7 +246,7 @@ def _compute_qkv0(self, query, key, value): value = value.transpose(1, self.time_dim) num_blocks = round(t2 / self.context) - #print(num_blocks, t2, self.context) + # print(num_blocks, t2, self.context) context_k = math.ceil(t2 / num_blocks) context_q = math.ceil(t1 / num_blocks) pad1 = context_q * num_blocks - t1 @@ -232,39 +260,42 @@ def _compute_qkv0(self, query, key, value): value = nn.functional.pad(value, (0, 0, 0, pad2)) # print('2',query.shape,key.shape,value.shape) - 
q0 = self.linear_q(query) # (batch, time1, head*d_k) - k0 = self.linear_k(key) # (batch, time2, head*d_k) - v0 = self.linear_v(value) # (batch, time2, head*d_v) + q0 = self.linear_q(query) # (batch, time1, head*d_k) + k0 = self.linear_k(key) # (batch, time2, head*d_k) + v0 = self.linear_v(value) # (batch, time2, head*d_v) return q0, k0, v0, context_q, context_k, num_blocks - - def _compute_scores(self, q0, k0, num_blocks, - context_q, context_k, q_left_shift, k_left_shift): + def _compute_scores( + self, q0, k0, num_blocks, context_q, context_k, q_left_shift, k_left_shift + ): batch_size = q0.size(0) if q_left_shift > 0: # we are computing the shifted block-diag score matrix q_right_shift = context_q - q_left_shift k_right_shift = context_k - k_left_shift - q0 = q0[:,q_left_shift:-q_right_shift] - k0 = k0[:,k_left_shift:-k_right_shift] - - q = q0.view( - batch_size, -1, self.num_heads, self.d_k).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks, -1, self.d_k) - # (batch, head, blocks, time1, d_k) - k = k0.view( - batch_size, -1, self.num_heads, self.d_k).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks, -1, self.d_k) - # (batch, head, blocks time2, d_k) + q0 = q0[:, q_left_shift:-q_right_shift] + k0 = k0[:, k_left_shift:-k_right_shift] + + q = ( + q0.view(batch_size, -1, self.num_heads, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) + ) + # (batch, head, blocks, time1, d_k) + k = ( + k0.view(batch_size, -1, self.num_heads, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) + ) + # (batch, head, blocks time2, d_k) # print('4',q.shape,k.shape) return torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - @staticmethod def _softmax(scores1, scores2, shift1, shift2, t1, t2): """Computes softmax for block diagonal attention maps @@ -274,9 +305,9 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): with size=(batch, heads, blocks, t1, t2) scores2: attention scores from a shifted block-diagonal score matrix with size=(batch, heads, blocks-1, t1, t2) - shift1: shift of diagonal blocks of scores2 wrt scores1 in time steps in the + shift1: shift of diagonal blocks of scores2 wrt scores1 in time steps in the time dimension 1 - shift2: shift of diagonal blocks of scores2 wrt scores1 in time steps in the + shift2: shift of diagonal blocks of scores2 wrt scores1 in time steps in the time dimension 2, with self-attention shift1=shift2 t1: length of time dimension 1 (output time dimension) t2: length of time dimension 2 (input time dimension), with self-att t1=t2. @@ -286,7 +317,7 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): with size=(batch, heads, blocks, t1, t2) probs2: posterior attention scores for a shifted block-diagonal att. 
matrix with size=(batch, heads, blocks-1, t1, t2) - + """ if scores2.dtype == torch.half: min_val = -65504 @@ -300,30 +331,30 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): context2 = scores1.size(4) # set elements in scores2 that overlap with elements in scores1 to -inf - scores2[:,:,:,:context1-shift1,:context2-shift2] = min_val - scores2[:,:,:,shift1:,shift2:] = min_val + scores2[:, :, :, : context1 - shift1, : context2 - shift2] = min_val + scores2[:, :, :, shift1:, shift2:] = min_val - #set the padding time steps that we had to add to make integer block-number to -inf + # set the padding time steps that we had to add to make integer block-number to -inf # in scores1 # print('softmax', scores1.shape, scores2.shape, shift1, shift2, t1, t2, # scores1.size(2)*scores1.size(3) - t1, scores2.size(2)*scores2.size(3) + shift1 - t1, # scores1.size(2)*scores1.size(4) - t2, scores2.size(2)*scores2.size(4) + shift2 - t2) - dt1 = max(0, scores1.size(2)*scores1.size(3) - t1) + dt1 = max(0, scores1.size(2) * scores1.size(3) - t1) if dt1 > 0: - scores1[:,:,-1,-dt1:,:] = min_val - dt1 = max(0, scores2.size(2)*scores2.size(3) + shift1 - t1) + scores1[:, :, -1, -dt1:, :] = min_val + dt1 = max(0, scores2.size(2) * scores2.size(3) + shift1 - t1) # in scores2 if dt1 > 0: - scores2[:,:,-1,-dt1:,:] = min_val + scores2[:, :, -1, -dt1:, :] = min_val - dt2 = max(0, scores1.size(2)*scores1.size(4) - t2) + dt2 = max(0, scores1.size(2) * scores1.size(4) - t2) if dt2 > 0: - scores1[:,:,-1,:,-dt2:] = min_val - dt2 = max(0, scores2.size(2)*scores2.size(4) + shift2 - t2) + scores1[:, :, -1, :, -dt2:] = min_val + dt2 = max(0, scores2.size(2) * scores2.size(4) + shift2 - t2) # in scores2 if dt2 > 0: - scores2[:,:,-1,:,-dt2:] = min_val + scores2[:, :, -1, :, -dt2:] = min_val # dt1 = max(0, scores1.size(2)*scores1.size(3) - t1) # dt2 = max(0, scores1.size(2)*scores1.size(4) - t2) @@ -334,30 +365,35 @@ def _softmax(scores1, scores2, shift1, shift2, t1, t2): # dt2 = max(0, dt2 - shift2) # if dt1 > 0 or dt2 > 0: # scores2[:,:,-1,-dt1:,-dt2:] = min_val - - #flatten blocks and time1 dimensions + + # flatten blocks and time1 dimensions scores1 = scores1.view(batch_size, num_heads, -1, context2) scores2 = scores2.view(batch_size, num_heads, -1, context2) - #print('aa', scores1.shape, scores2.shape) - #pad scores2 to have the same size as scores1 - scores2 = nn.functional.pad(scores2, (0, 0, shift1, context1-shift1), - mode='constant', value=min_val) - #print('bb', scores1.shape, scores2.shape) - #concat scores1, scores2 and do softmax in time2 dimension + # print('aa', scores1.shape, scores2.shape) + # pad scores2 to have the same size as scores1 + scores2 = nn.functional.pad( + scores2, (0, 0, shift1, context1 - shift1), mode="constant", value=min_val + ) + # print('bb', scores1.shape, scores2.shape) + # concat scores1, scores2 and do softmax in time2 dimension # (batch, heads, blocks*time1, 2*time2) probs = torch.softmax(torch.cat((scores1, scores2), dim=-1), dim=-1) - - #now we separate back probs into probs1, and probs2 - #probs1 - probs1 = probs[:,:,:,:context2].contiguous().view( - batch_size, num_heads, num_blocks, -1, context2) - #probs2 - probs2 = probs[:,:,shift1:-(context1-shift1),context2:].contiguous().view( - batch_size, num_heads, num_blocks-1, -1, context2) - - return probs1, probs2 - + # now we separate back probs into probs1, and probs2 + # probs1 + probs1 = ( + probs[:, :, :, :context2] + .contiguous() + .view(batch_size, num_heads, num_blocks, -1, context2) + ) + # probs2 + probs2 = ( + probs[:, :, 
shift1 : -(context1 - shift1), context2:] + .contiguous() + .view(batch_size, num_heads, num_blocks - 1, -1, context2) + ) + + return probs1, probs2 def _mask_scores_1d(self, scores, mask, shift1, shift2): if scores.dtype == torch.half: @@ -371,7 +407,8 @@ def _mask_scores_1d(self, scores, mask, shift1, shift2): context2 = scores.size(4) mask_blocks = torch.ones_like(scores, dtype=mask.dtype) mask_single_block = torch.zeros( - (batch_size, context1, context2), dtype=mask.dtype) + (batch_size, context1, context2), dtype=mask.dtype + ) t1_start = shift1 t2_start = shift2 @@ -379,14 +416,13 @@ def _mask_scores_1d(self, scores, mask, shift1, shift2): t1_end = t1_start + context1 t2_end = t2_start + context2 mask_single_block.fill_(False) - mask_single_block.masked_fill_(mask[:,0,t1_start:t1_end], True) - mask_single_block.masked_fill_(mask[:,:,t2_start:t2_end], True) - mask_blocks[:,block] = mask_single_block + mask_single_block.masked_fill_(mask[:, 0, t1_start:t1_end], True) + mask_single_block.masked_fill_(mask[:, :, t2_start:t2_end], True) + mask_blocks[:, block] = mask_single_block t1_start += context1 t2_start += context2 return scores.masked_fill(mask_blocks, min_value) - def _mask_scores_2d(self, scores, mask, shift1, shift2): if scores.dtype == torch.half: @@ -404,42 +440,48 @@ def _mask_scores_2d(self, scores, mask, shift1, shift2): for block in range(num_blocks): t1_end = min(t1_start + context1, mask.size(1)) t2_end = min(t2_start + context2, mask.size(2)) - mask_blocks[:,block,:(t1_end-t1_start),:(t2_end-t2_start)] = mask[ - :,t1_start:t1_end,t2_start:t2_end] + mask_blocks[:, block, : (t1_end - t1_start), : (t2_end - t2_start)] = mask[ + :, t1_start:t1_end, t2_start:t2_end + ] t1_start += context1 t2_start += context2 return scores.masked_fill(mask_blocks, min_value) - - def _compute_softmax(self, scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2): + def _compute_softmax( + self, scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 + ): if mask is not None: # put to -inf scores in points where mask==0 if mask.dim() == 4: # case when mask is 2d matrix per batch element - mask = mask.eq(0) # (batch, time1, time2) + mask = mask.eq(0) # (batch, time1, time2) # first, we mask block diagonal blocks scores1 = self._mask_scores_2d(scores1, mask, 0, 0) # second, we mask shifted block diagonal blocks - scores2 = self._mask_scores_2d(scores2, mask, q_left_shift, k_left_shift) + scores2 = self._mask_scores_2d( + scores2, mask, q_left_shift, k_left_shift + ) else: # case when mask is 1d vector per batch element, # meaning that time1 and time2 are the same, so mask is symmetric mask = nn.functional.pad(mask, (0, pad2)) - mask = mask.squeeze(1).eq(0) # (batch, 1, time) + mask = mask.squeeze(1).eq(0) # (batch, 1, time) # first, we mask block diagonal blocks scores1 = self._mask_scores_1d(scores1, mask, 0, 0) # second, we mask shifted block diagonal blocks - scores2 = self._mask_scores_1d(scores2, mask, q_left_shift, k_left_shift) + scores2 = self._mask_scores_1d( + scores2, mask, q_left_shift, k_left_shift + ) self.attn1, self.attn2 = self._softmax( - scores1, scores2, q_left_shift, k_left_shift, t1, t2) - + scores1, scores2, q_left_shift, k_left_shift, t1, t2 + ) def _apply_attn(self, v0, t1): if self.dropout_rate > 0: @@ -458,48 +500,60 @@ def _apply_attn(self, v0, t1): q_right_shift = context_q - q_left_shift k_right_shift = context_k - k_left_shift - v = v0.view( - batch_size, -1, self.num_heads, self.d_v).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, 
num_blocks, -1, self.d_k) + v = ( + v0.view(batch_size, -1, self.num_heads, self.d_v) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) + ) # (batch, heads, blocks, time2, d_v) # print('8',p_attn1.shape,p_attn2.shape, v.shape) # (batch, head, blocks, time1, time2) x (batch, head, blocks, time2, d_v) x = torch.matmul(p_attn1, v) # (batch, heads, blocks, time1, d_k) # print('9',x.shape) - x = x.view(batch_size, self.num_heads, -1, self.d_k).transpose( - 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) + x = ( + x.view(batch_size, self.num_heads, -1, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, -1, self.num_heads * self.d_v) + ) # (batch, time1, d_model) # print('10',x.shape) - v = v0[:,k_left_shift:-k_right_shift].view( - batch_size, -1, self.num_heads, self.d_v).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks-1, -1, self.d_v) + v = ( + v0[:, k_left_shift:-k_right_shift] + .view(batch_size, -1, self.num_heads, self.d_v) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks - 1, -1, self.d_v) + ) # (batch, blocks-1, head, time2, d_v) # print('11',p_attn1.shape,p_attn2.shape, v.shape) # (batch, blocks-1, head, time1, time2) x (batch, blocks-1, head, time2, d_v) x2 = torch.matmul(p_attn2, v) # (batch, heads, blocks-1, time1, d_k) # print('12',x2.shape) - x2 = x2.view(batch_size, self.num_heads, -1, self.d_k).transpose( - 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) + x2 = ( + x2.view(batch_size, self.num_heads, -1, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, -1, self.num_heads * self.d_v) + ) # (batch, time1, d_model) # print('12',x2.shape) - x[:,q_left_shift:-q_right_shift:] = x[:,q_left_shift:-q_right_shift:] + x2 - x = x[:,:t1] + x[:, q_left_shift:-q_right_shift:] = x[:, q_left_shift:-q_right_shift:] + x2 + x = x[:, :t1] return self.linear_out(x) # (batch, time1, d_model) - def forward1(self, query, key, value, mask): """Computes 'Local Scaled Dot Product Attention'. Args: - query: query with size=(batch, time1, in_feats), + query: query with size=(batch, time1, in_feats), where time1 is the output time dimension key: key with size=(batch, time2, in_feats) where time1 is the input time dimension value: value with size=(batch, time2, in_feats) - mask: optional mask with size=(batch, time1, time2), + mask: optional mask with size=(batch, time1, time2), to zero attention between some time steps. 
or (batch, time) if time1=time2 Returns: @@ -509,33 +563,31 @@ def forward1(self, query, key, value, mask): t1 = query.size(self.time_dim) t2 = key.size(self.time_dim) if t2 <= self.context: - return super().forward( - query, key, value, mask) + return super().forward(query, key, value, mask) q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - query, key, value) + query, key, value + ) # q0 size=(batch, time1, head * d_k) # k0 size=(batch, time2, head * d_k) # v0 size=(batch, time2, head * d_v) - - # compute block diagonal affinity matrix + # compute block diagonal affinity matrix # # print('3',q0.shape,k0.shape,v0.shape) # q = q0.view( # batch_size, -1, self.num_heads, self.d_k).transpose( # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks, -1, self.d_k) + # batch_size, self.num_heads, num_blocks, -1, self.d_k) # # (batch, head, blocks, time1, d_k) # k = k0.view( # batch_size, -1, self.num_heads, self.d_k).transpose( # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks, -1, self.d_k) + # batch_size, self.num_heads, num_blocks, -1, self.d_k) # # (batch, head, blocks time2, d_k) # # print('4',q.shape,k.shape) - # scores1 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) - scores1 = self._compute_scores( - q0, k0, num_blocks, context_q, context_k, 0, 0) + # scores1 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + scores1 = self._compute_scores(q0, k0, num_blocks, context_q, context_k, 0, 0) # (batch, head, blocks context_q, context_k) # print('5',scores1.shape) @@ -547,28 +599,28 @@ def forward1(self, query, key, value, mask): # q = q0[:,q_left_shift:-q_right_shift].view( # batch_size, -1, self.num_heads, self.d_k).transpose( # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks-1, -1, self.d_k) + # batch_size, self.num_heads, num_blocks-1, -1, self.d_k) # # (batch, blocks-1, head, time1, d_k) # k = k0[:,k_left_shift:-k_right_shift].view( # batch_size, -1, self.num_heads, self.d_k).transpose( # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks-1, -1, self.d_k) + # batch_size, self.num_heads, num_blocks-1, -1, self.d_k) # # (batch, blocks-1, head, d_k) # # print('6',q.shape,k.shape) - # scores2 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + # scores2 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) scores2 = self._compute_scores( - q0, k0, num_blocks-1, context_q, context_k, - q_left_shift, k_left_shift) + q0, k0, num_blocks - 1, context_q, context_k, q_left_shift, k_left_shift + ) # (batch, head, blocks-1 context_q, context_k) # print('7',scores2.shape) - #combine both block diagonal affinity matrix to do the softmax + # combine both block diagonal affinity matrix to do the softmax # if mask is not None: # # put to -inf scores in points where mask==0 # if mask.dim() == 4: # # case when mask is 2d matrix per batch element - # mask = mask.eq(0) # (batch, time1, time2) + # mask = mask.eq(0) # (batch, time1, time2) # # first, we mask block diagonal blocks # scores1 = self._mask_scores_2d(scores1, mask, 0, 0) @@ -580,7 +632,7 @@ def forward1(self, query, key, value, mask): # # case when mask is 1d vector per batch element, # # meaning that time1 and time2 are the same, so mask is symmetric # mask = nn.functional.pad(mask, (0, pad2)) - # mask = mask.squeeze(1).eq(0) # (batch, 1, time) + # mask = mask.squeeze(1).eq(0) # (batch, 1, time) # # first, we mask block diagonal blocks # scores1 = self._mask_scores_1d(scores1, mask, 0, 0) @@ -591,8 +643,9 @@ def 
forward1(self, query, key, value, mask): # self.attn1, self.attn2 = self._softmax( # scores1, scores2, q_left_shift, k_left_shift, t1, t2) - self._compute_softmax(scores1, scores2, mask, - q_left_shift, k_left_shift, t1, t2) + self._compute_softmax( + scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 + ) return self._apply_attn(v0, t1) # if self.dropout_rate > 0: @@ -605,45 +658,44 @@ def forward1(self, query, key, value, mask): # v = v0.view( # batch_size, -1, self.num_heads, self.d_v).transpose( # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks, -1, self.d_k) + # batch_size, self.num_heads, num_blocks, -1, self.d_k) # # (batch, heads, blocks, time2, d_v) # # print('8',p_attn1.shape,p_attn2.shape, v.shape) # # (batch, blocks, head, time1, time2) x (batch, blocks, head, time2, d_v) # x = torch.matmul(p_attn1, v) # (batch, heads, blocks, time1, d_k) # # print('9',x.shape) # x = x.view(batch_size, self.num_heads, -1, self.d_k).transpose( - # 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) + # 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) # # (batch, time1, d_model) # # print('10',x.shape) # v = v0[:,k_left_shift:-k_right_shift].view( # batch_size, -1, self.num_heads, self.d_v).transpose( # 1, 2).contiguous().view( - # batch_size, self.num_heads, num_blocks-1, -1, self.d_v) + # batch_size, self.num_heads, num_blocks-1, -1, self.d_v) # # (batch, blocks-1, head, time2, d_v) # # print('11',p_attn1.shape,p_attn2.shape, v.shape) # # (batch, blocks-1, head, time1, time2) x (batch, blocks-1, head, time2, d_v) # x2 = torch.matmul(p_attn2, v) # (batch, heads, blocks-1, time1, d_k) # # print('12',x2.shape) # x2 = x2.view(batch_size, self.num_heads, -1, self.d_k).transpose( - # 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) + # 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) # # (batch, time1, d_model) # # print('12',x2.shape) # x[:,q_left_shift:-q_right_shift:] = x[:,q_left_shift:-q_right_shift:] + x2 # x = x[:,:t1] # return self.linear_out(x) # (batch, time1, d_model) - def forward2(self, query, key, value, mask): """Computes 'Local Scaled Dot Product Attention'. Args: - query: query with size=(batch, time1, in_feats), + query: query with size=(batch, time1, in_feats), where time1 is the output time dimension key: key with size=(batch, time2, in_feats) where time1 is the input time dimension value: value with size=(batch, time2, in_feats) - mask: optional mask with size=(batch, time1, time2), + mask: optional mask with size=(batch, time1, time2), to zero attention between some time steps. 
or (batch, time) if time1=time2 Returns: @@ -653,8 +705,7 @@ def forward2(self, query, key, value, mask): t1 = query.size(self.time_dim) t2 = key.size(self.time_dim) if t2 <= self.context: - return super().forward( - query, key, value, mask) + return super().forward(query, key, value, mask) if self.time_dim != 1: query = query.transpose(1, self.time_dim) @@ -662,12 +713,14 @@ def forward2(self, query, key, value, mask): value = value.transpose(1, self.time_dim) context_k = self.context - num_blocks = math.ceil(t2 / context_k) #(t2 + context_k//2)//context_k + num_blocks = math.ceil(t2 / context_k) # (t2 + context_k//2)//context_k context_q = math.ceil(t1 / num_blocks) - num_blocks_q = math.ceil(t1 / context_q) #(t1 + context_q//2)//context_q - assert num_blocks == num_blocks_q, ( - 'num_blocks_k({})!=num_blocks_q({}), context_k={}, context_q={}, t1={}, t2={}'.format( - num_blocks, num_blocks_q, context_k, context_q, t1, t2)) + num_blocks_q = math.ceil(t1 / context_q) # (t1 + context_q//2)//context_q + assert ( + num_blocks == num_blocks_q + ), "num_blocks_k({})!=num_blocks_q({}), context_k={}, context_q={}, t1={}, t2={}".format( + num_blocks, num_blocks_q, context_k, context_q, t1, t2 + ) pad1 = context_q * num_blocks - t1 pad2 = context_k * num_blocks - t2 # print('1',query.shape,key.shape,value.shape,pad1,pad2, context_q, context_k) @@ -679,10 +732,9 @@ def forward2(self, query, key, value, mask): value = nn.functional.pad(value, (0, 0, 0, pad2)) # print('2',query.shape,key.shape,value.shape) - q0 = self.linear_q(query) # (batch, time1, head*d_k) - k0 = self.linear_k(key) # (batch, time2, head*d_k) - v0 = self.linear_v(value) # (batch, time2, head*d_v) - + q0 = self.linear_q(query) # (batch, time1, head*d_k) + k0 = self.linear_k(key) # (batch, time2, head*d_k) + v0 = self.linear_v(value) # (batch, time2, head*d_v) # # q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( # # query, key, value) @@ -690,22 +742,25 @@ def forward2(self, query, key, value, mask): # # # k0 size=(batch, time2, head*d_k) # # # v0 size=(batch, time2, head*d_v) - - # compute block diagonal affinity matrix + # compute block diagonal affinity matrix # # print('3',q0.shape,k0.shape,v0.shape) - q = q0.view( - batch_size, -1, self.num_heads, self.d_k).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks, -1, self.d_k) - # (batch, head, blocks, time1, d_k) - k = k0.view( - batch_size, -1, self.num_heads, self.d_k).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks, -1, self.d_k) - # (batch, head, blocks time2, d_k) + q = ( + q0.view(batch_size, -1, self.num_heads, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) + ) + # (batch, head, blocks, time1, d_k) + k = ( + k0.view(batch_size, -1, self.num_heads, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) + ) + # (batch, head, blocks time2, d_k) # # print('4',q.shape,k.shape) - scores1 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + scores1 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) # # scores1 = self._compute_scores( # # q0, k0, num_blocks, context_q, context_k, 0, 0) # (batch, head, blocks context_q, context_k) @@ -716,31 +771,37 @@ def forward2(self, query, key, value, mask): k_left_shift = context_k // 2 q_right_shift = context_q - q_left_shift k_right_shift = context_k - k_left_shift - q = q0[:,q_left_shift:-q_right_shift].view( - batch_size, -1, 
self.num_heads, self.d_k).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks-1, -1, self.d_k) - # (batch, blocks-1, head, time1, d_k) - k = k0[:,k_left_shift:-k_right_shift].view( - batch_size, -1, self.num_heads, self.d_k).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks-1, -1, self.d_k) + q = ( + q0[:, q_left_shift:-q_right_shift] + .view(batch_size, -1, self.num_heads, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks - 1, -1, self.d_k) + ) + # (batch, blocks-1, head, time1, d_k) + k = ( + k0[:, k_left_shift:-k_right_shift] + .view(batch_size, -1, self.num_heads, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks - 1, -1, self.d_k) + ) # # (batch, blocks-1, head, d_k) # # print('6',q.shape,k.shape) - scores2 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) + scores2 = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k) # scores2 = self._compute_scores( - # q0, k0, num_blocks-1, context_q, context_k, + # q0, k0, num_blocks-1, context_q, context_k, # q_left_shift, k_left_shift) # (batch, head, blocks-1 context_q, context_k) # print('7',scores2.shape) - #combine both block diagonal affinity matrix to do the softmax + # combine both block diagonal affinity matrix to do the softmax # if mask is not None: # # put to -inf scores in points where mask==0 # if mask.dim() == 4: # # case when mask is 2d matrix per batch element - # mask = mask.eq(0) # (batch, time1, time2) + # mask = mask.eq(0) # (batch, time1, time2) # # first, we mask block diagonal blocks # scores1 = self._mask_scores_2d(scores1, mask, 0, 0) @@ -752,7 +813,7 @@ def forward2(self, query, key, value, mask): # # case when mask is 1d vector per batch element, # # meaning that time1 and time2 are the same, so mask is symmetric # mask = nn.functional.pad(mask, (0, pad2)) - # mask = mask.squeeze(1).eq(0) # (batch, 1, time) + # mask = mask.squeeze(1).eq(0) # (batch, 1, time) # # first, we mask block diagonal blocks # scores1 = self._mask_scores_1d(scores1, mask, 0, 0) @@ -761,9 +822,10 @@ def forward2(self, query, key, value, mask): # scores2 = self._mask_scores_1d(scores2, mask, q_left_shift, k_left_shift) self.attn1, self.attn2 = self._softmax( - scores1, scores2, q_left_shift, k_left_shift, t1, t2) + scores1, scores2, q_left_shift, k_left_shift, t1, t2 + ) - # # self._compute_softmax(scores1, scores2, mask, + # # self._compute_softmax(scores1, scores2, mask, # # q_left_shift, k_left_shift, t1, t2) # # return self._apply_attn(v0, t1) @@ -774,48 +836,60 @@ def forward2(self, query, key, value, mask): p_attn1 = self.attn1 p_attn2 = self.attn2 - v = v0.view( - batch_size, -1, self.num_heads, self.d_v).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks, -1, self.d_k) + v = ( + v0.view(batch_size, -1, self.num_heads, self.d_v) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks, -1, self.d_k) + ) # (batch, heads, blocks, time2, d_v) # print('8',p_attn1.shape,p_attn2.shape, v.shape) # (batch, blocks, head, time1, time2) x (batch, blocks, head, time2, d_v) x = torch.matmul(p_attn1, v) # (batch, heads, blocks, time1, d_k) # print('9',x.shape) - x = x.view(batch_size, self.num_heads, -1, self.d_k).transpose( - 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) + x = ( + x.view(batch_size, self.num_heads, -1, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, -1, self.num_heads * self.d_v) + ) # 
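# --- reviewer note (illustrative sketch, not part of the patch): forward1/forward2 above
# implement local attention by reshaping the projected queries/keys into
# (batch, head, blocks, context, d_k), so softmax attention runs independently inside each
# block; a second, half-context-shifted set of blocks (scores2) lets neighbouring blocks
# overlap. Toy sizes below are made up; q stands in for the output of linear_q after padding.
import torch

batch, heads, d_k, context, num_blocks = 2, 4, 8, 10, 3
t = context * num_blocks
q = torch.randn(batch, t, heads * d_k)  # stand-in for linear_q output, already padded

q_blocks = (
    q.view(batch, -1, heads, d_k)
    .transpose(1, 2)
    .contiguous()
    .view(batch, heads, num_blocks, context, d_k)
)
k_blocks = q_blocks  # toy self-attention case: keys equal queries
scores = torch.matmul(q_blocks, k_blocks.transpose(-2, -1)) / d_k ** 0.5
assert scores.shape == (batch, heads, num_blocks, context, context)
# --- end of reviewer note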
(batch, time1, d_model) # print('10',x.shape) - v = v0[:,k_left_shift:-k_right_shift].view( - batch_size, -1, self.num_heads, self.d_v).transpose( - 1, 2).contiguous().view( - batch_size, self.num_heads, num_blocks-1, -1, self.d_v) + v = ( + v0[:, k_left_shift:-k_right_shift] + .view(batch_size, -1, self.num_heads, self.d_v) + .transpose(1, 2) + .contiguous() + .view(batch_size, self.num_heads, num_blocks - 1, -1, self.d_v) + ) # (batch, blocks-1, head, time2, d_v) # print('11',p_attn1.shape,p_attn2.shape, v.shape) # (batch, blocks-1, head, time1, time2) x (batch, blocks-1, head, time2, d_v) x2 = torch.matmul(p_attn2, v) # (batch, heads, blocks-1, time1, d_k) # print('12',x2.shape) - x2 = x2.view(batch_size, self.num_heads, -1, self.d_k).transpose( - 1, 2).contiguous().view(batch_size, -1, self.num_heads * self.d_v) + x2 = ( + x2.view(batch_size, self.num_heads, -1, self.d_k) + .transpose(1, 2) + .contiguous() + .view(batch_size, -1, self.num_heads * self.d_v) + ) # (batch, time1, d_model) # print('12',x2.shape) - x[:,q_left_shift:-q_right_shift:] = x[:,q_left_shift:-q_right_shift:] + x2 - x = x[:,:t1] + x[:, q_left_shift:-q_right_shift:] = x[:, q_left_shift:-q_right_shift:] + x2 + x = x[:, :t1] return self.linear_out(x) # (batch, time1, d_model) - def forward(self, query, key, value, mask): """Computes 'Local Scaled Dot Product Attention'. Args: - query: query with size=(batch, time1, in_feats), + query: query with size=(batch, time1, in_feats), where time1 is the output time dimension key: key with size=(batch, time2, in_feats) where time1 is the input time dimension value: value with size=(batch, time2, in_feats) - mask: optional mask with size=(batch, time1, time2), + mask: optional mask with size=(batch, time1, time2), to zero attention between some time steps. 
or (batch, time) if time1=time2 Returns: @@ -826,38 +900,37 @@ def forward(self, query, key, value, mask): t2 = key.size(self.time_dim) if t2 <= 2 * self.context: - return super().forward( - query, key, value, mask) + return super().forward(query, key, value, mask) q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - query, key, value) + query, key, value + ) # q0 size=(batch, time1, head*d_k) # k0 size=(batch, time2, head*d_k) # v0 size=(batch, time2, head*d_v) - # compute block diagonal affinity matrix - scores1 = self._compute_scores( - q0, k0, num_blocks, context_q, context_k, 0, 0) + # compute block diagonal affinity matrix + scores1 = self._compute_scores(q0, k0, num_blocks, context_q, context_k, 0, 0) # (batch, head, blocks context_q, context_k) # compute shifted block diagonal affinity matrix q_left_shift = context_q // 2 k_left_shift = context_k // 2 scores2 = self._compute_scores( - q0, k0, num_blocks-1, context_q, context_k, - q_left_shift, k_left_shift) + q0, k0, num_blocks - 1, context_q, context_k, q_left_shift, k_left_shift + ) # (batch, head, blocks-1 context_q, context_k) - #combine both block diagonal affinity matrix to do the softmax - self._compute_softmax(scores1, scores2, mask, - q_left_shift, k_left_shift, t1, t2) + # combine both block diagonal affinity matrix to do the softmax + self._compute_softmax( + scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 + ) return self._apply_attn(v0, t1) - class ScaledDotProdAttRelPosEncV1(ScaledDotProdAttV1): """Scaled dot product multihead attention layer - with relative positional encoders as defined in + with relative positional encoders as defined in https://arxiv.org/pdf/1901.02860.pdf Attributes: @@ -868,14 +941,30 @@ class ScaledDotProdAttRelPosEncV1(ScaledDotProdAttV1): d_v: value projection dimension causal_pos_enc: positional encoder is 0 for attending future frames. dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input + time_dim: time dimension in the input, default=1 meaning input dimensions are (batch, time, in_feats) """ - def __init__(self, in_feats, out_feats, num_heads, d_k, d_v, - causal_pos_enc=False, dropout_rate=0, time_dim=1): - super().__init__(in_feats, out_feats, num_heads, d_k, d_v, - dropout_rate=dropout_rate, time_dim=time_dim) + def __init__( + self, + in_feats, + out_feats, + num_heads, + d_k, + d_v, + causal_pos_enc=False, + dropout_rate=0, + time_dim=1, + ): + super().__init__( + in_feats, + out_feats, + num_heads, + d_k, + d_v, + dropout_rate=dropout_rate, + time_dim=time_dim, + ) self.linear_pos = nn.Linear(in_feats, num_heads * d_k) # u, v in paper, Sec 3.3, 2nd eq. 
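Reviewer note (illustrative sketch, not part of the patch): the u/v parameters initialized above are the global content and position biases of Transformer-XL (Sec. 3.3, 2nd eq.). A minimal, self-contained sketch of how they enter the attention scores; tensor sizes are made up, and the row-wise left shift that aligns the positional term is omitted for brevity:

import math
import torch

batch, heads, t1, t2, d_k = 2, 4, 6, 6, 8
q = torch.randn(batch, heads, t1, d_k)   # projected queries
k = torch.randn(batch, heads, t2, d_k)   # projected keys
p = torch.randn(1, heads, t2, d_k)       # projected pos. embeddings R_{L-1}, ..., R_0
u = torch.randn(heads, 1, d_k)           # plays the role of self.u (content bias)
v = torch.randn(heads, 1, d_k)           # plays the role of self.v (position bias)

AC = torch.matmul(q + u, k.transpose(-2, -1))       # terms (a)+(c) of the paper's 2nd eq.
BDtilde = torch.matmul(q + v, p.transpose(-2, -1))  # terms (b)+(d) before the left shift
scores = (AC + BDtilde) / math.sqrt(d_k)            # the real layer left-shifts BDtilde first
assert scores.shape == (batch, heads, t1, t2)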
@@ -886,65 +975,67 @@ def __init__(self, in_feats, out_feats, num_heads, d_k, d_v, nn.init.xavier_uniform_(self.v) self.causal_pos_enc = causal_pos_enc - + self._tril = None self._tril_diag = 0 self._triu = None self._triu_diag = 0 - def _apply_tril(self, x): - """ Applies lower triangular mask to (Q + v^T) W R_{i-j} attention matrix - to keep causal attention points, i.e., i-j >= 0 - E.g., - if t1=3, t2=4 this will apply a mask - [1 1 0 0; - 1 1 1 0; - 1 1 1 1 ] + """Applies lower triangular mask to (Q + v^T) W R_{i-j} attention matrix + to keep causal attention points, i.e., i-j >= 0 + E.g., + if t1=3, t2=4 this will apply a mask + [1 1 0 0; + 1 1 1 0; + 1 1 1 1 ] """ diag = x.size(3) - x.size(2) - if (self._tril is None or - self._tril.size(2) < x.size(2) or self._tril.size(3) < x.size(3) or - self._tril_diag != diag): + if ( + self._tril is None + or self._tril.size(2) < x.size(2) + or self._tril.size(3) < x.size(3) + or self._tril_diag != diag + ): # in these cases we need to recompute the lower triangular mask ones = torch.ones((x.size(2), x.size(3)), dtype=x.dtype, device=x.device) self._tril = torch.tril(ones, diag)[None, None, :, :] self._tril_diag = diag tril = self._tril else: - tril = self._tril[:,:,:x.size(2),:x.size(3)] + tril = self._tril[:, :, : x.size(2), : x.size(3)] return x * tril - def _apply_triu(self, x): - """ Applies upper triangular mask to (Q + v^T) W R_{i-j} attention matrix + """Applies upper triangular mask to (Q + v^T) W R_{i-j} attention matrix to keep non-causal attention points, i.e., i-j < 0 E.g., if t1=3, t2=4 this will apply a mask [0 0 1 1; 0 0 0 1; - 0 0 0 0 ] + 0 0 0 0 ] """ - #we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice - diag = x.size(3) - x.size(2) + 1 - if (self._triu is None or - self._triu.size(2) < x.size(2) or self._triu.size(3) < x.size(3) or - self._triu_diag != diag): + # we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice + diag = x.size(3) - x.size(2) + 1 + if ( + self._triu is None + or self._triu.size(2) < x.size(2) + or self._triu.size(3) < x.size(3) + or self._triu_diag != diag + ): # in these cases we need to recompute the lower triangular mask ones = torch.ones((x.size(2), x.size(3)), dtype=x.dtype, device=x.device) self._triu = torch.triu(ones, diag)[None, None, :, :] self._triu_diag = diag triu = self._triu else: - triu = self._triu[:,:,-x.size(2):,-x.size(3):] + triu = self._triu[:, :, -x.size(2) :, -x.size(3) :] return x * triu - - def _left_shift(self, x): - """ Applies left shifts to the rows of x + """Applies left shifts to the rows of x to get scores with relative pos encodings R_{i-j} i-j >=0, causal attention @@ -958,14 +1049,13 @@ def _left_shift(self, x): q1 R2, q1 R1, q1 R0, 0 ; q2 R3, q2 R2, q2 R1, q2 R0] """ - x_pad = nn.functional.pad(x, (1, 0), mode='constant', value=0) + x_pad = nn.functional.pad(x, (1, 0), mode="constant", value=0) x_pad = x_pad.view(*x.size()[:2], x.size(3) + 1, x.size(2)) x = x_pad[:, :, 1:].view_as(x) return self._apply_tril(x) - def _right_shift(self, x): - """ Applies right shifts to the rows of x + """Applies right shifts to the rows of x to get scores with relative pos encodings R_{i-j} i-j < 0, non-causal attention @@ -979,24 +1069,22 @@ def _right_shift(self, x): 0, 0 , q1 R_{-1}; 0, 0 , 0 ] """ - x_pad = nn.functional.pad(x, (0, 1), mode='constant', value=0) + x_pad = nn.functional.pad(x, (0, 1), mode="constant", value=0) x_pad = x_pad.view(*x.size()[:2], x.size(3) + 1, x.size(2)) x = x_pad[:, :, :-1].view_as(x) return 
self._apply_triu(x) - - def forward(self, query, key, value, pos_emb=None, mask=None): """Computes 'Scaled Dot Product Attention'. Args: - query: query with size=(batch, time1, in_feats), + query: query with size=(batch, time1, in_feats), where time1 is the output time dimension key: key with size=(batch, time2, in_feats) where time1 is the input time dimension value: value with size=(batch, time2, in_feats) pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0 - mask: optional mask with size=(batch, time1, time2), + mask: optional mask with size=(batch, time1, time2), to zero attention between some time steps or size=(batch, time) to make time1=time2 Returns: @@ -1004,21 +1092,23 @@ def forward(self, query, key, value, pos_emb=None, mask=None): """ batch_size = value.size(0) q, k, v = self._compute_qkv(query, key, value) - + pos_batch_size = pos_emb.size(0) p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, self.d_k) p = p.transpose(1, 2) # (batch, head, time2, d_k) q = q.transpose(1, 2) # (batch, time1, head, d_k) - q_plus_u = (q + self.u).transpose(1, 2) #(batch, head, time1, d_k) - q_plus_v = (q + self.v).transpose(1, 2) #(batch, head, time1, d_k) + q_plus_u = (q + self.u).transpose(1, 2) # (batch, head, time1, d_k) + q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) # compute A(a) + A(c) in Sec3.3, 2nd Eq. - AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) # (batch, head, time1, time2) + AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) # (batch, head, time1, time2) # compute A(b) + A(d) in Sec3.3, 2nd Eq. for the causal part # This is the sum of Btilde and Dtilde in the Appendix of the paper - BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) # (batch, head, time1, time2) + BDtilde = torch.matmul( + q_plus_v, p.transpose(-2, -1) + ) # (batch, head, time1, time2) # apply left shift as indicated in the Appendix to geth B+D BD = self._left_shift(BDtilde) @@ -1027,29 +1117,35 @@ def forward(self, query, key, value, pos_emb=None, mask=None): # this is not included in the paper because it doesn't allow to attent to future postions # we assume that t2 >= t1 dt = key.size(1) - query.size(1) - pos_emb_noncausal = pos_emb[:,dt:].flip(dims=(1,)) # we flip to get R_0, ..., R_{L-1} - pos_emb_noncausal[:,:,0::2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} - assert pos_emb[0, -2, 0] == - pos_emb_noncausal[0, 1, 0] + pos_emb_noncausal = pos_emb[:, dt:].flip( + dims=(1,) + ) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[ + :, :, 0::2 + ] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] p = self.linear_pos(pos_emb_noncausal).view( - pos_batch_size, -1, self.num_heads, self.d_k) + pos_batch_size, -1, self.num_heads, self.d_k + ) p = p.transpose(1, 2) # (batch, head, time2-dt, d_k) - BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) # (batch, head, time1, time2-dt) - BD_noncausal = self._right_shift(BDtilde) - BD[:,:,:,dt:] += BD_noncausal + BDtilde = torch.matmul( + q_plus_v, p.transpose(-2, -1) + ) # (batch, head, time1, time2-dt) + BD_noncausal = self._right_shift(BDtilde) + BD[:, :, :, dt:] += BD_noncausal - #add and normalize + # add and normalize scores = (AC + BD) / math.sqrt(self.d_k) # (batch, head, time1, time2) self.attn = self._compute_softmax(scores, mask) return self._apply_attn(v) - class LocalScaledDotProdAttRelPosEncV1(LocalScaledDotProdAttV1): """Local Scaled dot product multihead attention layer It 
calculates self-attention between time steps within a window of 'context' frames. - It uses relative positional encoders as defined in + It uses relative positional encoders as defined in https://arxiv.org/pdf/1901.02860.pdf Attributes: @@ -1061,14 +1157,32 @@ class LocalScaledDotProdAttRelPosEncV1(LocalScaledDotProdAttV1): context: maximum attention temporal context. causal_pos_enc: positional encoder is 0 for attending future frames. dropout_rate: dropout rate - time_dim: time dimension in the input, default=1 meaning input + time_dim: time dimension in the input, default=1 meaning input dimensions are (batch, time, in_feats) """ - def __init__(self, in_feats, out_feats, num_heads, d_k, d_v, - context=25, causal_pos_enc=False, dropout_rate=0, time_dim=1): - super().__init__(in_feats, out_feats, num_heads, d_k, d_v, - context, dropout_rate=dropout_rate, time_dim=time_dim) + def __init__( + self, + in_feats, + out_feats, + num_heads, + d_k, + d_v, + context=25, + causal_pos_enc=False, + dropout_rate=0, + time_dim=1, + ): + super().__init__( + in_feats, + out_feats, + num_heads, + d_k, + d_v, + context, + dropout_rate=dropout_rate, + time_dim=time_dim, + ) self.linear_pos = nn.Linear(in_feats, num_heads * d_k) # u, v in paper, Sec 3.3, 2nd eq. @@ -1079,65 +1193,67 @@ def __init__(self, in_feats, out_feats, num_heads, d_k, d_v, nn.init.xavier_uniform_(self.v) self.causal_pos_enc = causal_pos_enc - + self._tril = None self._tril_diag = 0 self._triu = None self._triu_diag = 0 - def _apply_tril(self, x): - """ Applies lower triangular mask to (Q + v^T) W R_{i-j} attention matrix - to keep causal attention points, i.e., i-j >= 0 - E.g., - if t1=3, t2=4 this will apply a mask - [1 1 0 0; - 1 1 1 0; - 1 1 1 1 ] + """Applies lower triangular mask to (Q + v^T) W R_{i-j} attention matrix + to keep causal attention points, i.e., i-j >= 0 + E.g., + if t1=3, t2=4 this will apply a mask + [1 1 0 0; + 1 1 1 0; + 1 1 1 1 ] """ diag = x.size(4) - x.size(3) - if (self._tril is None or - self._tril.size(3) < x.size(3) or self._tril.size(4) < x.size(4) or - self._tril_diag != diag): + if ( + self._tril is None + or self._tril.size(3) < x.size(3) + or self._tril.size(4) < x.size(4) + or self._tril_diag != diag + ): # in these cases we need to recompute the lower triangular mask ones = torch.ones((x.size(3), x.size(4)), dtype=x.dtype, device=x.device) self._tril = torch.tril(ones, diag)[None, None, None, :, :] self._tril_diag = diag tril = self._tril else: - tril = self._tril[:,:,:,:x.size(3),:x.size(4)] + tril = self._tril[:, :, :, : x.size(3), : x.size(4)] return x * tril - def _apply_triu(self, x): - """ Applies upper triangular mask to (Q + v^T) W R_{i-j} attention matrix + """Applies upper triangular mask to (Q + v^T) W R_{i-j} attention matrix to keep non-causal attention points, i.e., i-j < 0 E.g., if t1=3, t2=4 this will apply a mask [0 0 1 1; 0 0 0 1; - 0 0 0 0 ] + 0 0 0 0 ] """ - #we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice - diag = x.size(4) - x.size(3) + 1 - if (self._triu is None or - self._triu.size(3) < x.size(3) or self._triu.size(4) < x.size(4) or - self._triu_diag != diag): + # we add 1 to put the diagonal to 0 so we don't count the R_0 embedding twice + diag = x.size(4) - x.size(3) + 1 + if ( + self._triu is None + or self._triu.size(3) < x.size(3) + or self._triu.size(4) < x.size(4) + or self._triu_diag != diag + ): # in these cases we need to recompute the lower triangular mask ones = torch.ones((x.size(3), x.size(4)), dtype=x.dtype, device=x.device) 
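# --- reviewer note (illustrative sketch, not part of the patch): the masks produced by
# _apply_tril/_apply_triu for the docstring example t1=3, t2=4; the diagonal offsets below
# follow the code (size(-1) - size(-2), and +1 for the upper-triangular mask).
import torch

t1, t2 = 3, 4
ones = torch.ones(t1, t2)
causal_keep = torch.tril(ones, diagonal=t2 - t1)         # [[1,1,0,0],[1,1,1,0],[1,1,1,1]]
noncausal_keep = torch.triu(ones, diagonal=t2 - t1 + 1)  # [[0,0,1,1],[0,0,0,1],[0,0,0,0]]
assert torch.equal(causal_keep + noncausal_keep, ones)   # the two masks partition the scores
# --- end of reviewer note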
self._triu = torch.triu(ones, diag)[None, None, None, :, :] self._triu_diag = diag triu = self._triu else: - triu = self._triu[:,:,:,-x.size(3):,-x.size(4):] + triu = self._triu[:, :, :, -x.size(3) :, -x.size(4) :] return x * triu - - def _left_shift(self, x, context, left_shift): - """ Applies left shifts to the rows of x + """Applies left shifts to the rows of x to get scores with relative pos encodings R_{i-j} i-j >=0, causal attention @@ -1153,17 +1269,16 @@ def _left_shift(self, x, context, left_shift): """ if left_shift > 0: right_shift = context - left_shift - x = x[:,:,left_shift:-right_shift] + x = x[:, :, left_shift:-right_shift] x = x.view(x.size(0), x.size(1), -1, context, x.size(-1)) - x_pad = nn.functional.pad(x, (1, 0), mode='constant', value=0) + x_pad = nn.functional.pad(x, (1, 0), mode="constant", value=0) x_pad = x_pad.view(*x.size()[:3], x.size(4) + 1, x.size(3)) - x = x_pad[:,:,:,1:].view_as(x) + x = x_pad[:, :, :, 1:].view_as(x) return self._apply_tril(x) - def _right_shift(self, x, context, left_shift): - """ Applies right shifts to the rows of x + """Applies right shifts to the rows of x to get scores with relative pos encodings R_{i-j} i-j < 0, non-causal attention @@ -1179,27 +1294,25 @@ def _right_shift(self, x, context, left_shift): """ if left_shift > 0: right_shift = context - left_shift - x = x[:,:,left_shift:-right_shift] + x = x[:, :, left_shift:-right_shift] x = x.view(x.size(0), x.size(1), -1, context, x.size(-1)) - x_pad = nn.functional.pad(x, (0, 1), mode='constant', value=0) + x_pad = nn.functional.pad(x, (0, 1), mode="constant", value=0) x_pad = x_pad.view(*x.size()[:3], x.size(4) + 1, x.size(3)) - x = x_pad[:,:,:,:-1].view_as(x) + x = x_pad[:, :, :, :-1].view_as(x) return self._apply_triu(x) - - def forward(self, query, key, value, pos_emb=None, mask=None): """Computes 'Scaled Dot Product Attention'. Args: - query: query with size=(batch, time1, in_feats), + query: query with size=(batch, time1, in_feats), where time1 is the output time dimension key: key with size=(batch, time2, in_feats) where time1 is the input time dimension value: value with size=(batch, time2, in_feats) pos_emb: positional embedding size=(batch, time2, in_feats) as R_{L-1}, ..., R_0 - mask: optional mask with size=(batch, time1, time2), + mask: optional mask with size=(batch, time1, time2), to zero attention between some time steps or size=(batch, time) to make time1=time2 Returns: @@ -1209,81 +1322,106 @@ def forward(self, query, key, value, pos_emb=None, mask=None): t1 = query.size(self.time_dim) t2 = key.size(self.time_dim) q0, k0, v0, context_q, context_k, num_blocks = self._compute_qkv0( - query, key, value) + query, key, value + ) # q0 size=(batch, time1, head*d_k) # k0 size=(batch, time2, head*d_k) # v0 size=(batch, time2, head*d_v) - q_plus_u0 = q0 + self.u.view(-1, q0.size(-1)) # (batch, time1, head*d_k) + q_plus_u0 = q0 + self.u.view(-1, q0.size(-1)) # (batch, time1, head*d_k) # q = q.transpose(1, 2) # (batch, time1, head, d_k) # q_plus_u = (q + self.u).transpose(1, 2) #(batch, head, time1, d_k) # q_plus_v = (q + self.v).transpose(1, 2) #(batch, head, time1, d_k) # compute A(a) + A(c) in Sec3.3, 2nd Eq. 
block diagonals - # 1) compute block diagonal affinity matrix + # 1) compute block diagonal affinity matrix AC1 = self._compute_scores( - q_plus_u0, k0, num_blocks, context_q, context_k, 0, 0) + q_plus_u0, k0, num_blocks, context_q, context_k, 0, 0 + ) # (batch, head, blocks, context_q, context_k) # 2) compute shifted block diagonal matrix q_left_shift = context_q // 2 k_left_shift = context_k // 2 AC2 = self._compute_scores( - q_plus_u0, k0, num_blocks-1, context_q, context_k, - q_left_shift, k_left_shift) + q_plus_u0, + k0, + num_blocks - 1, + context_q, + context_k, + q_left_shift, + k_left_shift, + ) # (batch, head, blocks-1, context_q, context_k) - #AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) # (batch, head, time1, time2) + # AC = torch.matmul(q_plus_u, k.transpose(-2, -1)) # (batch, head, time1, time2) - pos_emb = pos_emb[:,-context_k:] # (1, context_k, d_model) + pos_emb = pos_emb[:, -context_k:] # (1, context_k, d_model) pos_batch_size = pos_emb.size(0) p = self.linear_pos(pos_emb).view(pos_batch_size, -1, self.num_heads, self.d_k) p = p.transpose(1, 2) # (1, head, context_k, d_k) - - q = q0.view(batch_size, -1, self.num_heads, self.d_k) # (batch, time1, head, d_k) - q_plus_v = (q + self.v).transpose(1, 2) #(batch, head, time1, d_k) + q = q0.view( + batch_size, -1, self.num_heads, self.d_k + ) # (batch, time1, head, d_k) + q_plus_v = (q + self.v).transpose(1, 2) # (batch, head, time1, d_k) # compute A(b) + A(d) in Sec3.3, 2nd Eq. for the causal part # This is the sum of Btilde and Dtilde in the Appendix of the paper - BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt(self.d_k) # (batch, head, time1, context_k) - # apply left shift as indicated in the Appendix to geth B+D + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt( + self.d_k + ) # (batch, head, time1, context_k) + # apply left shift as indicated in the Appendix to geth B+D # 1) block-diagonal part of BD: BD1 - BD1 = self._left_shift(BDtilde, context_q, 0) # (batch, head, blocks, context_q, context_k) + BD1 = self._left_shift( + BDtilde, context_q, 0 + ) # (batch, head, blocks, context_q, context_k) # 2) shifted block diagonal part of BD: BD2 - BD2 = self._left_shift(BDtilde, context_q, q_left_shift) # (batch, head, blocks-1, context_q, context_k) - - #print('BD\n',BD1[0,0,0,:10,:10]) - #print(BD2[0,0,0,:10,:10]) + BD2 = self._left_shift( + BDtilde, context_q, q_left_shift + ) # (batch, head, blocks-1, context_q, context_k) + + # print('BD\n',BD1[0,0,0,:10,:10]) + # print(BD2[0,0,0,:10,:10]) if not self.causal_pos_enc: # compute A(b) + A(d) for the non-causal part, # this is not included in the paper because it doesn't allow to attent to future postions # we assume that t2 >= t1, and therefore context_k >= context_q - dt = context_k - context_q - pos_emb_noncausal = pos_emb[:,dt:].flip(dims=(1,)) # we flip to get R_0, ..., R_{L-1} - pos_emb_noncausal[:,:,0::2] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} - assert pos_emb[0, -2, 0] == - pos_emb_noncausal[0, 1, 0] + dt = context_k - context_q + pos_emb_noncausal = pos_emb[:, dt:].flip( + dims=(1,) + ) # we flip to get R_0, ..., R_{L-1} + pos_emb_noncausal[ + :, :, 0::2 + ] *= -1 # we multiply sin emb by -1 to get R_0, R_{-1}, ..., R_{-(L-1)} + assert pos_emb[0, -2, 0] == -pos_emb_noncausal[0, 1, 0] p = self.linear_pos(pos_emb_noncausal).view( - pos_batch_size, -1, self.num_heads, self.d_k) + pos_batch_size, -1, self.num_heads, self.d_k + ) p = p.transpose(1, 2) # (batch, head, context_k-dt, d_k) - BDtilde = 
torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt(self.d_k) # (batch, head, time1, context_k-dt) - BD_noncausal1 = self._right_shift(BDtilde, context_q, 0) # (batch, head, blocks, context_q, context_k-dt) - BD_noncausal2 = self._right_shift(BDtilde, context_q, q_left_shift) # (batch, head, blocks-1, context_q, context_k-dt) - #print(BD_noncausal1[0,0,0,:10,:10]) - #print(BD_noncausal2[0,0,0,:10,:10]) - #print('BDshapes', BD1.shape, BD_noncausal1.shape, BD2.shape, BD_noncausal2.shape, BDtilde.shape, dt, context_k, context_q) - BD1[:,:,:,:,dt:] += BD_noncausal1 - BD2[:,:,:,:,dt:] += BD_noncausal2 - - #print(BD1[0,0,0,:10,:10]) - #print(BD2[0,0,0,:10,:10]) + BDtilde = torch.matmul(q_plus_v, p.transpose(-2, -1)) / math.sqrt( + self.d_k + ) # (batch, head, time1, context_k-dt) + BD_noncausal1 = self._right_shift( + BDtilde, context_q, 0 + ) # (batch, head, blocks, context_q, context_k-dt) + BD_noncausal2 = self._right_shift( + BDtilde, context_q, q_left_shift + ) # (batch, head, blocks-1, context_q, context_k-dt) + # print(BD_noncausal1[0,0,0,:10,:10]) + # print(BD_noncausal2[0,0,0,:10,:10]) + # print('BDshapes', BD1.shape, BD_noncausal1.shape, BD2.shape, BD_noncausal2.shape, BDtilde.shape, dt, context_k, context_q) + BD1[:, :, :, :, dt:] += BD_noncausal1 + BD2[:, :, :, :, dt:] += BD_noncausal2 + + # print(BD1[0,0,0,:10,:10]) + # print(BD2[0,0,0,:10,:10]) # add AC and BD for block-diag s - scores1 = AC1 + BD1 # (batch, head, blocks, context_q, context_k) - scores2 = AC2 + BD2 # (batch, head, blocks-1, context_q, context_k) - self._compute_softmax(scores1, scores2, mask, - q_left_shift, k_left_shift, t1, t2) + scores1 = AC1 + BD1 # (batch, head, blocks, context_q, context_k) + scores2 = AC2 + BD2 # (batch, head, blocks-1, context_q, context_k) + self._compute_softmax( + scores1, scores2, mask, q_left_shift, k_left_shift, t1, t2 + ) return self._apply_attn(v0, t1) - - diff --git a/hyperion/torch/layers/audio_feats.py b/hyperion/torch/layers/audio_feats.py index 9629454c..d435ebbd 100644 --- a/hyperion/torch/layers/audio_feats.py +++ b/hyperion/torch/layers/audio_feats.py @@ -12,10 +12,12 @@ import torch import torch.nn as nn import torch.cuda.amp as amp + try: from torch.fft import rfft as torch_rfft + _rfft = lambda x: torch_rfft(x, dim=-1) - _pow_spectrogram = lambda x: x.abs()**2 + _pow_spectrogram = lambda x: x.abs() ** 2 _spectrogram = lambda x: x.abs() except: _rfft = lambda x: torch.rfft(x, 1, normalized=False, onesided=True) @@ -25,11 +27,11 @@ from ...feats.filter_banks import FilterBankFactory as FBF # window types -HAMMING = 'hamming' -HANNING = 'hanning' -POVEY = 'povey' -RECTANGULAR = 'rectangular' -BLACKMAN = 'blackman' +HAMMING = "hamming" +HANNING = "hanning" +POVEY = "povey" +RECTANGULAR = "rectangular" +BLACKMAN = "blackman" WINDOWS = [HAMMING, HANNING, POVEY, RECTANGULAR, BLACKMAN] # def _amp_safe_matmul(a, b): @@ -40,41 +42,32 @@ # return torch.matmul(a, b) -def _get_feature_window_function(window_type, - window_size, - blackman_coeff=0.42): - r"""Returns a window function with the given type and size - """ +def _get_feature_window_function(window_type, window_size, blackman_coeff=0.42): + r"""Returns a window function with the given type and size""" if window_type == HANNING: return torch.hann_window(window_size, periodic=True) elif window_type == HAMMING: - return torch.hamming_window(window_size, - periodic=True, - alpha=0.54, - beta=0.46) + return torch.hamming_window(window_size, periodic=True, alpha=0.54, beta=0.46) elif window_type == POVEY: - #return 
torch.hann_window(window_size, periodic=True).pow(0.85) + # return torch.hann_window(window_size, periodic=True).pow(0.85) a = 2 * math.pi / window_size - window_function = torch.arange(window_size, - dtype=torch.get_default_dtype()) + window_function = torch.arange(window_size, dtype=torch.get_default_dtype()) return (0.5 - 0.5 * torch.cos(a * window_function)).pow(0.85) elif window_type == RECTANGULAR: return torch.ones(window_size, dtype=torch.get_default_dtype()) elif window_type == BLACKMAN: a = 2 * math.pi / window_size - window_function = torch.arange(window_size, - dtype=torch.get_default_dtype()) - return blackman_coeff - 0.5 * torch.cos(a * window_function) + \ - (0.5 - blackman_coeff) * torch.cos(2 * a * window_function) + window_function = torch.arange(window_size, dtype=torch.get_default_dtype()) + return ( + blackman_coeff + - 0.5 * torch.cos(a * window_function) + + (0.5 - blackman_coeff) * torch.cos(2 * a * window_function) + ) else: - raise Exception('Invalid window type ' + window_type) + raise Exception("Invalid window type " + window_type) -def _get_strided_batch(waveform, - window_length, - window_shift, - snip_edges, - center=False): +def _get_strided_batch(waveform, window_length, window_shift, snip_edges, center=False): r"""Given a waveform (1D tensor of size ``num_samples``), it returns a 2D tensor (m, ``window_size``) representing how the window is shifted along the waveform. Each row is a frame. @@ -107,8 +100,7 @@ def _get_strided_batch(waveform, npad_left = int(window_length // 2) npad_right = npad_left npad = 2 * npad_left - num_frames = 1 + (num_samples + npad - - window_length) // window_shift + num_frames = 1 + (num_samples + npad - window_length) // window_shift else: num_frames = (num_samples + (window_shift // 2)) // window_shift new_num_samples = (num_frames - 1) * window_shift + window_length @@ -116,45 +108,49 @@ def _get_strided_batch(waveform, npad_left = int((window_length - window_shift) // 2) npad_right = npad - npad_left - #waveform = nn.functional.pad(waveform, (npad_left, npad_right), mode='reflect') - pad_left = torch.flip(waveform[:, 1:npad_left + 1], (1, )) - pad_right = torch.flip(waveform[:, -npad_right - 1:-1], (1, )) + # waveform = nn.functional.pad(waveform, (npad_left, npad_right), mode='reflect') + pad_left = torch.flip(waveform[:, 1 : npad_left + 1], (1,)) + pad_right = torch.flip(waveform[:, -npad_right - 1 : -1], (1,)) waveform = torch.cat((pad_left, waveform, pad_right), dim=1) - strides = (waveform.stride(0), window_shift * waveform.stride(1), - waveform.stride(1)) + strides = ( + waveform.stride(0), + window_shift * waveform.stride(1), + waveform.stride(1), + ) sizes = (batch_size, num_frames, window_length) return waveform.as_strided(sizes, strides) def _get_log_energy(x, energy_floor): - r"""Returns the log energy of size (m) for a strided_input (m,*) - """ + r"""Returns the log energy of size (m) for a strided_input (m,*)""" log_energy = (x.pow(2).sum(-1) + 1e-15).log() # size (m) if energy_floor > 0.0: log_energy = torch.max( log_energy, - torch.tensor(math.log(energy_floor), - dtype=torch.get_default_dtype())) + torch.tensor(math.log(energy_floor), dtype=torch.get_default_dtype()), + ) return log_energy class Wav2Win(nn.Module): - def __init__(self, - fs=16000, - frame_length=25, - frame_shift=10, - pad_length=None, - remove_dc_offset=True, - preemph_coeff=0.97, - window_type='povey', - dither=1, - snip_edges=True, - center=False, - energy_floor=0, - raw_energy=True, - return_log_energy=False): + def __init__( + self, + 
fs=16000, + frame_length=25, + frame_shift=10, + pad_length=None, + remove_dc_offset=True, + preemph_coeff=0.97, + window_type="povey", + dither=1, + snip_edges=True, + center=False, + energy_floor=0, + raw_energy=True, + return_log_energy=False, + ): super().__init__() self.fs = fs @@ -175,9 +171,9 @@ def __init__(self, self._length = N self._shift = int(math.floor(frame_shift * fs / 1000)) - self._window = nn.Parameter(_get_feature_window_function( - window_type, N), - requires_grad=False) + self._window = nn.Parameter( + _get_feature_window_function(window_type, N), requires_grad=False + ) self.pad_length = N if pad_length is None else pad_length assert self.pad_length >= N @@ -186,14 +182,25 @@ def __repr__(self): def __str__(self): s = ( - '{}(fs={}, frame_length={}, frame_shift={}, pad_length={}, ' - 'remove_dc_offset={}, preemph_coeff={}, window_type={} ' - 'dither={}, snip_edges={}, center={}, energy_floor={}, raw_energy={}, return_log_energy={})' - ).format(self.__class__.__name__, self.fs, self.frame_length, - self.frame_shift, self.pad_length, self.remove_dc_offset, - self.preemph_coeff, self.window_type, self.dither, - self.snip_edges, self.center, self.energy_floor, - self.raw_energy, self.return_log_energy) + "{}(fs={}, frame_length={}, frame_shift={}, pad_length={}, " + "remove_dc_offset={}, preemph_coeff={}, window_type={} " + "dither={}, snip_edges={}, center={}, energy_floor={}, raw_energy={}, return_log_energy={})" + ).format( + self.__class__.__name__, + self.fs, + self.frame_length, + self.frame_shift, + self.pad_length, + self.remove_dc_offset, + self.preemph_coeff, + self.window_type, + self.dither, + self.snip_edges, + self.center, + self.energy_floor, + self.raw_energy, + self.return_log_energy, + ) return s def forward(self, x): @@ -203,46 +210,42 @@ def forward(self, x): n = torch.randn(x.shape, device=x.device) x = x + self.dither * n - #remove offset + # remove offset if self.remove_dc_offset: mu = torch.mean(x, dim=1, keepdim=True) x = x - mu if self.return_log_energy and self.raw_energy: # Compute the log energy of each frame - x_strided = _get_strided_batch(x, - self._length, - self._shift, - self.snip_edges, - center=self.center) - log_energy = _get_log_energy(x_strided, - self.energy_floor) # size (m) + x_strided = _get_strided_batch( + x, self._length, self._shift, self.snip_edges, center=self.center + ) + log_energy = _get_log_energy(x_strided, self.energy_floor) # size (m) if self.preemph_coeff != 0.0: - x_offset = torch.nn.functional.pad(x.unsqueeze(1), (1, 0), - mode='replicate').squeeze(1) + x_offset = torch.nn.functional.pad( + x.unsqueeze(1), (1, 0), mode="replicate" + ).squeeze(1) x = x - self.preemph_coeff * x_offset[:, :-1] - x_strided = _get_strided_batch(x, - self._length, - self._shift, - self.snip_edges, - center=self.center) + x_strided = _get_strided_batch( + x, self._length, self._shift, self.snip_edges, center=self.center + ) # Apply window_function to each frame x_strided = x_strided * self._window if self.return_log_energy and not self.raw_energy: signal_log_energy = _get_log_energy( - strided_input, self.energy_floor) # size (batch, m) + strided_input, self.energy_floor + ) # size (batch, m) # Pad columns with zero until we reach size (batch, num_frames, pad_length) if self.pad_length != self._length: pad = self.pad_length - self._length - x_strided = torch.nn.functional.pad(x_strided.unsqueeze(1), - (0, pad), - mode='constant', - value=0).squeeze(1) + x_strided = torch.nn.functional.pad( + x_strided.unsqueeze(1), (0, pad), 
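# --- reviewer note (illustrative sketch, not part of the patch): the pre-emphasis and
# framing steps that Wav2Win.forward applies, on a toy signal; constants mirror the
# defaults above, and only the simple snip_edges=True frame count is shown.
import torch

wav = torch.randn(1, 16000)                       # (batch, samples), 1 s at 16 kHz
preemph = 0.97
wav_prev = torch.nn.functional.pad(wav.unsqueeze(1), (1, 0), mode="replicate").squeeze(1)
wav = wav - preemph * wav_prev[:, :-1]            # y[t] = x[t] - 0.97 * x[t-1]

length, shift = 400, 160                          # 25 ms window, 10 ms shift at 16 kHz
num_frames = 1 + (wav.size(1) - length) // shift  # snip_edges=True frame count
frames = wav.as_strided(
    (1, num_frames, length),
    (wav.stride(0), shift * wav.stride(1), wav.stride(1)),
)
assert frames.shape == (1, num_frames, length)
# --- end of reviewer note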
mode="constant", value=0 + ).squeeze(1) if self.return_log_energy: return x_strided, log_energy @@ -251,41 +254,45 @@ def forward(self, x): class Wav2FFT(nn.Module): - def __init__(self, - fs=16000, - frame_length=25, - frame_shift=10, - fft_length=512, - remove_dc_offset=True, - preemph_coeff=0.97, - window_type='povey', - dither=1, - snip_edges=True, - center=False, - energy_floor=0, - raw_energy=True, - use_energy=True): + def __init__( + self, + fs=16000, + frame_length=25, + frame_shift=10, + fft_length=512, + remove_dc_offset=True, + preemph_coeff=0.97, + window_type="povey", + dither=1, + snip_edges=True, + center=False, + energy_floor=0, + raw_energy=True, + use_energy=True, + ): super().__init__() N = int(math.floor(frame_length * fs / 1000)) if N > fft_length: k = math.ceil(math.log(N) / math.log(2)) - self.fft_length = int(2**k) - - self.wav2win = Wav2Win(fs, - frame_length, - frame_shift, - pad_length=fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemph_coeff, - window_type=window_type, - dither=dither, - snip_edges=snip_edges, - center=center, - energy_floor=0, - raw_energy=raw_energy, - return_log_energy=use_energy) + self.fft_length = int(2 ** k) + + self.wav2win = Wav2Win( + fs, + frame_length, + frame_shift, + pad_length=fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemph_coeff, + window_type=window_type, + dither=dither, + snip_edges=snip_edges, + center=center, + energy_floor=0, + raw_energy=raw_energy, + return_log_energy=use_energy, + ) self.fft_length = fft_length self.use_energy = use_energy @@ -324,7 +331,7 @@ def forward(self, x): if self.use_energy: x_strided, log_e = x_strided - #X = torch.rfft(x_strided, 1, normalized=False, onesided=True) + # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) if self.use_energy: @@ -334,35 +341,39 @@ def forward(self, x): class Wav2Spec(Wav2FFT): - def __init__(self, - fs=16000, - frame_length=25, - frame_shift=10, - fft_length=512, - remove_dc_offset=True, - preemph_coeff=0.97, - window_type='povey', - use_fft_mag=False, - dither=1, - snip_edges=True, - center=False, - energy_floor=0, - raw_energy=True, - use_energy=True): - - super().__init__(fs, - frame_length, - frame_shift, - fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemph_coeff, - window_type=window_type, - dither=dither, - snip_edges=snip_edges, - center=center, - energy_floor=energy_floor, - raw_energy=raw_energy, - use_energy=use_energy) + def __init__( + self, + fs=16000, + frame_length=25, + frame_shift=10, + fft_length=512, + remove_dc_offset=True, + preemph_coeff=0.97, + window_type="povey", + use_fft_mag=False, + dither=1, + snip_edges=True, + center=False, + energy_floor=0, + raw_energy=True, + use_energy=True, + ): + + super().__init__( + fs, + frame_length, + frame_shift, + fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemph_coeff, + window_type=window_type, + dither=dither, + snip_edges=snip_edges, + center=center, + energy_floor=energy_floor, + raw_energy=raw_energy, + use_energy=use_energy, + ) self.use_fft_mag = use_fft_mag if use_fft_mag: @@ -376,7 +387,7 @@ def forward(self, x): if self.use_energy: x_strided, log_e = x_strided - #X = torch.rfft(x_strided, 1, normalized=False, onesided=True) + # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) pow_spec = self._to_spec(X) # pow_spec = X.pow(2).sum(-1) @@ -390,35 +401,39 @@ def forward(self, x): class Wav2LogSpec(Wav2FFT): - def __init__(self, - fs=16000, - 
frame_length=25, - frame_shift=10, - fft_length=512, - remove_dc_offset=True, - preemph_coeff=0.97, - window_type='povey', - use_fft_mag=False, - dither=1, - snip_edges=True, - center=False, - energy_floor=0, - raw_energy=True, - use_energy=True): - - super().__init__(fs, - frame_length, - frame_shift, - fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemph_coeff, - window_type=window_type, - dither=dither, - snip_edges=snip_edges, - center=center, - energy_floor=energy_floor, - raw_energy=raw_energy, - use_energy=use_energy) + def __init__( + self, + fs=16000, + frame_length=25, + frame_shift=10, + fft_length=512, + remove_dc_offset=True, + preemph_coeff=0.97, + window_type="povey", + use_fft_mag=False, + dither=1, + snip_edges=True, + center=False, + energy_floor=0, + raw_energy=True, + use_energy=True, + ): + + super().__init__( + fs, + frame_length, + frame_shift, + fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemph_coeff, + window_type=window_type, + dither=dither, + snip_edges=snip_edges, + center=center, + energy_floor=energy_floor, + raw_energy=raw_energy, + use_energy=use_energy, + ) self.use_fft_mag = use_fft_mag if use_fft_mag: @@ -432,7 +447,7 @@ def forward(self, x): if self.use_energy: x_strided, log_e = x_strided - #X = torch.rfft(x_strided, 1, normalized=False, onesided=True) + # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) pow_spec = self._to_spec(X) @@ -449,40 +464,44 @@ def forward(self, x): class Wav2LogFilterBank(Wav2FFT): - def __init__(self, - fs=16000, - frame_length=25, - frame_shift=10, - fft_length=512, - remove_dc_offset=True, - preemph_coeff=0.97, - window_type='povey', - use_fft_mag=False, - dither=1, - fb_type='mel_kaldi', - low_freq=20, - high_freq=0, - num_filters=23, - norm_filters=False, - snip_edges=True, - center=False, - energy_floor=0, - raw_energy=True, - use_energy=True): - - super().__init__(fs, - frame_length, - frame_shift, - fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemph_coeff, - window_type=window_type, - dither=dither, - snip_edges=snip_edges, - center=center, - energy_floor=energy_floor, - raw_energy=raw_energy, - use_energy=use_energy) + def __init__( + self, + fs=16000, + frame_length=25, + frame_shift=10, + fft_length=512, + remove_dc_offset=True, + preemph_coeff=0.97, + window_type="povey", + use_fft_mag=False, + dither=1, + fb_type="mel_kaldi", + low_freq=20, + high_freq=0, + num_filters=23, + norm_filters=False, + snip_edges=True, + center=False, + energy_floor=0, + raw_energy=True, + use_energy=True, + ): + + super().__init__( + fs, + frame_length, + frame_shift, + fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemph_coeff, + window_type=window_type, + dither=dither, + snip_edges=snip_edges, + center=center, + energy_floor=energy_floor, + raw_energy=raw_energy, + use_energy=use_energy, + ) self.use_fft_mag = use_fft_mag self.fb_type = fb_type @@ -491,11 +510,18 @@ def __init__(self, self.num_filters = num_filters self.norm_filters = norm_filters - fb = FBF.create(fb_type, num_filters, self.fft_length, self.fs, - low_freq, high_freq, norm_filters) - self._fb = nn.Parameter(torch.tensor(fb, - dtype=torch.get_default_dtype()), - requires_grad=False) + fb = FBF.create( + fb_type, + num_filters, + self.fft_length, + self.fs, + low_freq, + high_freq, + norm_filters, + ) + self._fb = nn.Parameter( + torch.tensor(fb, dtype=torch.get_default_dtype()), requires_grad=False + ) if use_fft_mag: self._to_spec = _spectrogram 
else: @@ -507,7 +533,7 @@ def forward(self, x): if self.use_energy: x_strided, log_e = x_strided - #X = torch.rfft(x_strided, 1, normalized=False, onesided=True) + # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) # logging.info('X={} {}'.format(X, X.type())) # logging.info('X={}'.format(X.type())) @@ -520,11 +546,11 @@ def forward(self, x): with amp.autocast(enabled=False): pow_spec = torch.matmul(pow_spec.float(), self._fb.float()) - #logging.info('fb={} {}'.format(pow_spec, pow_spec.type())) - #logging.info('fb={}'.format(pow_spec.type())) + # logging.info('fb={} {}'.format(pow_spec, pow_spec.type())) + # logging.info('fb={}'.format(pow_spec.type())) pow_spec = (pow_spec + 1e-10).log() - #logging.info('lfb={} {}'.format(pow_spec, pow_spec.type())) - #logging.info('lfb={}'.format(pow_spec.type())) + # logging.info('lfb={} {}'.format(pow_spec, pow_spec.type())) + # logging.info('lfb={}'.format(pow_spec.type())) if self.use_energy: pow_spec = torch.cat((log_e.unsqueeze(-1), pow_spec), dim=-1) @@ -532,42 +558,46 @@ def forward(self, x): class Wav2MFCC(Wav2FFT): - def __init__(self, - fs=16000, - frame_length=25, - frame_shift=10, - fft_length=512, - remove_dc_offset=True, - preemph_coeff=0.97, - window_type='povey', - use_fft_mag=False, - dither=1, - fb_type='mel_kaldi', - low_freq=20, - high_freq=0, - num_filters=23, - norm_filters=False, - num_ceps=13, - snip_edges=True, - center=False, - cepstral_lifter=22, - energy_floor=0, - raw_energy=True, - use_energy=True): - - super().__init__(fs, - frame_length, - frame_shift, - fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemph_coeff, - window_type=window_type, - dither=dither, - snip_edges=snip_edges, - center=center, - energy_floor=energy_floor, - raw_energy=raw_energy, - use_energy=use_energy) + def __init__( + self, + fs=16000, + frame_length=25, + frame_shift=10, + fft_length=512, + remove_dc_offset=True, + preemph_coeff=0.97, + window_type="povey", + use_fft_mag=False, + dither=1, + fb_type="mel_kaldi", + low_freq=20, + high_freq=0, + num_filters=23, + norm_filters=False, + num_ceps=13, + snip_edges=True, + center=False, + cepstral_lifter=22, + energy_floor=0, + raw_energy=True, + use_energy=True, + ): + + super().__init__( + fs, + frame_length, + frame_shift, + fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemph_coeff, + window_type=window_type, + dither=dither, + snip_edges=snip_edges, + center=center, + energy_floor=energy_floor, + raw_energy=raw_energy, + use_energy=use_energy, + ) self.use_fft_mag = use_fft_mag self.fb_type = fb_type @@ -578,17 +608,24 @@ def __init__(self, self.num_ceps = num_ceps self.cepstral_lifter = cepstral_lifter - fb = FBF.create(fb_type, num_filters, self.fft_length, self.fs, - low_freq, high_freq, norm_filters) - self._fb = nn.Parameter(torch.tensor(fb, - dtype=torch.get_default_dtype()), - requires_grad=False) - self._dct = nn.Parameter(self.make_dct_matrix(self.num_ceps, - self.num_filters), - requires_grad=False) - self._lifter = nn.Parameter(self.make_lifter(self.num_ceps, - self.cepstral_lifter), - requires_grad=False) + fb = FBF.create( + fb_type, + num_filters, + self.fft_length, + self.fs, + low_freq, + high_freq, + norm_filters, + ) + self._fb = nn.Parameter( + torch.tensor(fb, dtype=torch.get_default_dtype()), requires_grad=False + ) + self._dct = nn.Parameter( + self.make_dct_matrix(self.num_ceps, self.num_filters), requires_grad=False + ) + self._lifter = nn.Parameter( + self.make_lifter(self.num_ceps, 
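# --- reviewer note (illustrative sketch, not part of the patch): how the DCT matrix and
# lifter built in Wav2MFCC are applied to a log-mel filterbank; the construction mirrors
# make_dct_matrix/make_lifter above, while the application order (log-mel -> DCT -> lifter)
# is the standard MFCC pipeline and is stated here as an assumption, with made-up shapes.
import math
import torch

num_filters, num_ceps, Q = 23, 13, 22.0
n = torch.arange(float(num_filters)).unsqueeze(1)        # (num_filters, 1)
k = torch.arange(float(num_ceps))                        # (num_ceps,)
dct = torch.cos(math.pi / num_filters * (n + 0.5) * k)   # (num_filters, num_ceps)
dct[:, 0] *= 1.0 / math.sqrt(2.0)
dct *= math.sqrt(2.0 / num_filters)
lifter = 1 + 0.5 * Q * torch.sin(math.pi * torch.arange(float(num_ceps)) / Q)

log_fbank = torch.randn(2, 100, num_filters)             # stand-in (batch, frames, num_filters)
mfcc = torch.matmul(log_fbank, dct) * lifter              # (batch, frames, num_ceps)
assert mfcc.shape == (2, 100, num_ceps)
# --- end of reviewer note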
self.cepstral_lifter), requires_grad=False + ) if use_fft_mag: self._to_spec = _spectrogram else: @@ -597,25 +634,27 @@ def __init__(self, @staticmethod def make_lifter(N, Q): """Makes the liftering function - - Args: - N: Number of cepstral coefficients. - Q: Liftering parameter - Returns: - Liftering vector. + Args: + N: Number of cepstral coefficients. + Q: Liftering parameter + + Returns: + Liftering vector. """ if Q == 0: return 1 return 1 + 0.5 * Q * torch.sin( - math.pi * torch.arange(N, dtype=torch.get_default_dtype()) / Q) + math.pi * torch.arange(N, dtype=torch.get_default_dtype()) / Q + ) @staticmethod def make_dct_matrix(num_ceps, num_filters): n = torch.arange(float(num_filters)).unsqueeze(1) k = torch.arange(float(num_ceps)) - dct = torch.cos(math.pi / float(num_filters) * (n + 0.5) * - k) # size (n_mfcc, n_mels) + dct = torch.cos( + math.pi / float(num_filters) * (n + 0.5) * k + ) # size (n_mfcc, n_mels) dct[:, 0] *= 1.0 / math.sqrt(2.0) dct *= math.sqrt(2.0 / float(num_filters)) return dct @@ -626,7 +665,7 @@ def forward(self, x): if self.use_energy: x_strided, log_e = x_strided - #X = torch.rfft(x_strided, 1, normalized=False, onesided=True) + # X = torch.rfft(x_strided, 1, normalized=False, onesided=True) X = _rfft(x_strided) pow_spec = self._to_spec(X) # pow_spec = X.pow(2).sum(-1) @@ -649,57 +688,64 @@ def forward(self, x): class Wav2KanBayashiLogFilterBank(Wav2LogFilterBank): - """Class to replicate log-filter-banks used in - Kan Bayashi's ParallelWaveGAN repository: - https://github.com/kan-bayashi/ParallelWaveGAN + """Class to replicate log-filter-banks used in + Kan Bayashi's ParallelWaveGAN repository: + https://github.com/kan-bayashi/ParallelWaveGAN """ - def __init__(self, - fs=16000, - frame_length=64, - frame_shift=16, - fft_length=1024, - remove_dc_offset=True, - window_type='hanning', - low_freq=80, - high_freq=7600, - num_filters=80, - snip_edges=False, - center=True): - - super().__init__(fs=fs, - frame_length=frame_length, - frame_shift=frame_shift, - fft_length=fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=0, - window_type=window_type, - use_fft_mag=True, - dither=1e-5, - fb_type='mel_librosa', - low_freq=low_freq, - high_freq=high_freq, - num_filters=num_filters, - norm_filters=True, - snip_edges=snip_edges, - center=center, - use_energy=False) + + def __init__( + self, + fs=16000, + frame_length=64, + frame_shift=16, + fft_length=1024, + remove_dc_offset=True, + window_type="hanning", + low_freq=80, + high_freq=7600, + num_filters=80, + snip_edges=False, + center=True, + ): + + super().__init__( + fs=fs, + frame_length=frame_length, + frame_shift=frame_shift, + fft_length=fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=0, + window_type=window_type, + use_fft_mag=True, + dither=1e-5, + fb_type="mel_librosa", + low_freq=low_freq, + high_freq=high_freq, + num_filters=num_filters, + norm_filters=True, + snip_edges=snip_edges, + center=center, + use_energy=False, + ) # Kan Bayashi uses log10 instead of log - self.scale = 1. 
/ math.log(10) + self.scale = 1.0 / math.log(10) def forward(self, x): return self.scale * super().forward(x) -class Spec2LogFilterBank(): - def __init__(self, - fs=16000, - fft_length=512, - fb_type='mel_kaldi', - low_freq=20, - high_freq=0, - num_filters=23, - norm_filters=False): +class Spec2LogFilterBank: + def __init__( + self, + fs=16000, + fft_length=512, + fb_type="mel_kaldi", + low_freq=20, + high_freq=0, + num_filters=23, + norm_filters=False, + ): super().__init__() self.fs = fs @@ -710,11 +756,18 @@ def __init__(self, self.num_filters = num_filters self.norm_filters = norm_filters - fb = FBF.create(fb_type, num_filters, self.fft_length, self.fs, - low_freq, high_freq, norm_filters) - self._fb = nn.Parameter(torch.tensor(fb, - dtype=torch.get_default_dtype()), - requires_grad=False) + fb = FBF.create( + fb_type, + num_filters, + self.fft_length, + self.fs, + low_freq, + high_freq, + norm_filters, + ) + self._fb = nn.Parameter( + torch.tensor(fb, dtype=torch.get_default_dtype()), requires_grad=False + ) def forward(self, x): with amp.autocast(enabled=False): diff --git a/hyperion/torch/layers/audio_feats_factory.py b/hyperion/torch/layers/audio_feats_factory.py index 1108647e..ac463f07 100644 --- a/hyperion/torch/layers/audio_feats_factory.py +++ b/hyperion/torch/layers/audio_feats_factory.py @@ -9,131 +9,143 @@ from ...feats.filter_banks import FilterBankFactory as FBF from .audio_feats import * -FFT = 'fft' -SPEC = 'spec' -LOG_SPEC = 'log_spec' -LOG_FB = 'logfb' -MFCC = 'mfcc' -KAN_BAYASHI = 'kanbayashi_logfb' +FFT = "fft" +SPEC = "spec" +LOG_SPEC = "log_spec" +LOG_FB = "logfb" +MFCC = "mfcc" +KAN_BAYASHI = "kanbayashi_logfb" FEAT_TYPES = [FFT, SPEC, LOG_SPEC, LOG_FB, MFCC, KAN_BAYASHI] class AudioFeatsFactory(object): @staticmethod - def create(audio_feat, - sample_frequency=16000, - frame_length=25, - frame_shift=10, - fft_length=512, - remove_dc_offset=True, - preemphasis_coeff=0.97, - window_type='povey', - use_fft_mag=False, - dither=1, - fb_type='mel_kaldi', - low_freq=20, - high_freq=0, - num_filters=23, - norm_filters=False, - num_ceps=13, - snip_edges=True, - center=False, - cepstral_lifter=22, - energy_floor=0, - raw_energy=True, - use_energy=True): + def create( + audio_feat, + sample_frequency=16000, + frame_length=25, + frame_shift=10, + fft_length=512, + remove_dc_offset=True, + preemphasis_coeff=0.97, + window_type="povey", + use_fft_mag=False, + dither=1, + fb_type="mel_kaldi", + low_freq=20, + high_freq=0, + num_filters=23, + norm_filters=False, + num_ceps=13, + snip_edges=True, + center=False, + cepstral_lifter=22, + energy_floor=0, + raw_energy=True, + use_energy=True, + ): if audio_feat == FFT: - return Wav2FFT(sample_frequency, - frame_length, - frame_shift, - fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemphasis_coeff, - window_type=window_type, - dither=dither, - snip_edges=snip_edges, - center=center, - energy_floor=energy_floor, - raw_energy=raw_energy, - use_energy=use_energy) + return Wav2FFT( + sample_frequency, + frame_length, + frame_shift, + fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemphasis_coeff, + window_type=window_type, + dither=dither, + snip_edges=snip_edges, + center=center, + energy_floor=energy_floor, + raw_energy=raw_energy, + use_energy=use_energy, + ) if audio_feat == SPEC: - return Wav2Spec(sample_frequency, - frame_length, - frame_shift, - fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemphasis_coeff, - window_type=window_type, - use_fft_mag=use_fft_mag, - 
dither=dither, - snip_edges=snip_edges, - center=center, - energy_floor=energy_floor, - raw_energy=raw_energy, - use_energy=use_energy) + return Wav2Spec( + sample_frequency, + frame_length, + frame_shift, + fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemphasis_coeff, + window_type=window_type, + use_fft_mag=use_fft_mag, + dither=dither, + snip_edges=snip_edges, + center=center, + energy_floor=energy_floor, + raw_energy=raw_energy, + use_energy=use_energy, + ) if audio_feat == LOG_SPEC: - return Wav2LogSpec(sample_frequency, - frame_length, - frame_shift, - fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemphasis_coeff, - window_type=window_type, - use_fft_mag=use_fft_mag, - dither=dither, - snip_edges=snip_edges, - center=center, - energy_floor=energy_floor, - raw_energy=raw_energy, - use_energy=use_energy) + return Wav2LogSpec( + sample_frequency, + frame_length, + frame_shift, + fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemphasis_coeff, + window_type=window_type, + use_fft_mag=use_fft_mag, + dither=dither, + snip_edges=snip_edges, + center=center, + energy_floor=energy_floor, + raw_energy=raw_energy, + use_energy=use_energy, + ) if audio_feat == LOG_FB: - return Wav2LogFilterBank(sample_frequency, - frame_length, - frame_shift, - fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemphasis_coeff, - window_type=window_type, - use_fft_mag=use_fft_mag, - dither=dither, - fb_type=fb_type, - low_freq=low_freq, - high_freq=high_freq, - num_filters=num_filters, - norm_filters=norm_filters, - snip_edges=snip_edges, - center=center, - energy_floor=energy_floor, - raw_energy=raw_energy, - use_energy=use_energy) + return Wav2LogFilterBank( + sample_frequency, + frame_length, + frame_shift, + fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemphasis_coeff, + window_type=window_type, + use_fft_mag=use_fft_mag, + dither=dither, + fb_type=fb_type, + low_freq=low_freq, + high_freq=high_freq, + num_filters=num_filters, + norm_filters=norm_filters, + snip_edges=snip_edges, + center=center, + energy_floor=energy_floor, + raw_energy=raw_energy, + use_energy=use_energy, + ) if audio_feat == MFCC: - return Wav2MFCC(sample_frequency, - frame_length, - frame_shift, - fft_length, - remove_dc_offset=remove_dc_offset, - preemph_coeff=preemphasis_coeff, - window_type=window_type, - use_fft_mag=use_fft_mag, - dither=dither, - fb_type=fb_type, - low_freq=low_freq, - high_freq=high_freq, - num_filters=num_filters, - norm_filters=norm_filters, - num_ceps=num_ceps, - snip_edges=snip_edges, - center=center, - cepstral_lifter=cepstral_lifter, - energy_floor=energy_floor, - raw_energy=raw_energy, - use_energy=use_energy) + return Wav2MFCC( + sample_frequency, + frame_length, + frame_shift, + fft_length, + remove_dc_offset=remove_dc_offset, + preemph_coeff=preemphasis_coeff, + window_type=window_type, + use_fft_mag=use_fft_mag, + dither=dither, + fb_type=fb_type, + low_freq=low_freq, + high_freq=high_freq, + num_filters=num_filters, + norm_filters=norm_filters, + num_ceps=num_ceps, + snip_edges=snip_edges, + center=center, + cepstral_lifter=cepstral_lifter, + energy_floor=energy_floor, + raw_energy=raw_energy, + use_energy=use_energy, + ) if audio_feat == KAN_BAYASHI: return Wav2KanBayashiLogFilterBank( @@ -146,25 +158,43 @@ def create(audio_feat, low_freq=low_freq, high_freq=high_freq, num_filters=num_filters, - snip_edges=snip_edges) + snip_edges=snip_edges, + ) @staticmethod def filter_args(**kwargs): """Filters MFCC 
args from arguments dictionary. - - Args: - kwargs: Arguments dictionary. - - Returns: - Dictionary with MFCC options. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with MFCC options. """ - valid_args = ('sample_frequency', 'frame_length', 'frame_shift', - 'fft_length', 'remove_dc_offset', 'preemphasis_coeff', - 'window_type', 'blackman_coeff', 'use_fft_mag', 'dither', - 'fb_type', 'low_freq', 'high_freq', 'num_filters', - 'norm_filters', 'num_ceps', 'snip_edges', 'energy_floor', - 'raw_energy', 'use_energy', 'cepstral_lifter', - 'audio_feat') + valid_args = ( + "sample_frequency", + "frame_length", + "frame_shift", + "fft_length", + "remove_dc_offset", + "preemphasis_coeff", + "window_type", + "blackman_coeff", + "use_fft_mag", + "dither", + "fb_type", + "low_freq", + "high_freq", + "num_filters", + "norm_filters", + "num_ceps", + "snip_edges", + "energy_floor", + "raw_energy", + "use_energy", + "cepstral_lifter", + "audio_feat", + ) d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return d @@ -172,121 +202,141 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): """Adds MFCC options to parser. - - Args: - parser: Arguments parser - prefix: Options prefix. + + Args: + parser: Arguments parser + prefix: Options prefix. """ if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--sample-frequency', + "--sample-frequency", default=16000, type=int, help=( - 'Waveform data sample frequency (must match the waveform file, ' - 'if specified there)')) - - parser.add_argument('--frame-length', - type=int, - default=25, - help='Frame length in milliseconds') - parser.add_argument('--frame-shift', - type=int, - default=10, - help='Frame shift in milliseconds') - parser.add_argument('--fft-length', - type=int, - default=512, - help='Length of FFT') - - parser.add_argument('--remove-dc-offset', - default=True, - type=str2bool, - help='Subtract mean from waveform on each frame') - - parser.add_argument('--preemphasis-coeff', - type=float, - default=0.97, - help='Coefficient for use in signal preemphasis') + "Waveform data sample frequency (must match the waveform file, " + "if specified there)" + ), + ) + + parser.add_argument( + "--frame-length", type=int, default=25, help="Frame length in milliseconds" + ) + parser.add_argument( + "--frame-shift", type=int, default=10, help="Frame shift in milliseconds" + ) + parser.add_argument("--fft-length", type=int, default=512, help="Length of FFT") parser.add_argument( - '--window-type', - default='povey', - choices=['hamming', 'hanning', 'povey', 'rectangular', 'blackman'], - help=('Type of window ("hamming"|"hanning"|"povey"|' - '"rectangular"|"blackmann")')) + "--remove-dc-offset", + default=True, + type=str2bool, + help="Subtract mean from waveform on each frame", + ) + + parser.add_argument( + "--preemphasis-coeff", + type=float, + default=0.97, + help="Coefficient for use in signal preemphasis", + ) + + parser.add_argument( + "--window-type", + default="povey", + choices=["hamming", "hanning", "povey", "rectangular", "blackman"], + help=( + 'Type of window ("hamming"|"hanning"|"povey"|' + '"rectangular"|"blackmann")' + ), + ) parser.add_argument( - '--use-fft-mag', + "--use-fft-mag", default=False, - action='store_true', - help='If true, it uses |X(f)|, if false, it uses |X(f)|^2') + action="store_true", + help="If true, it uses |X(f)|, if false, it uses |X(f)|^2", + ) - parser.add_argument('--dither', - 
type=float, - default=1, - help='Dithering constant (0.0 means no dither)') + parser.add_argument( + "--dither", + type=float, + default=1, + help="Dithering constant (0.0 means no dither)", + ) FBF.add_class_args(parser) parser.add_argument( - '--num-ceps', + "--num-ceps", type=int, default=13, - help='Number of cepstra in MFCC computation (including C0)') + help="Number of cepstra in MFCC computation (including C0)", + ) parser.add_argument( - '--snip-edges', + "--snip-edges", default=True, type=str2bool, - help=('If true, end effects will be handled by outputting only ' - 'frames that completely fit in the file, and the number of ' - 'frames depends on the frame-length. If false, the number ' - 'of frames depends only on the frame-shift, ' - 'and we reflect the data at the ends.')) + help=( + "If true, end effects will be handled by outputting only " + "frames that completely fit in the file, and the number of " + "frames depends on the frame-length. If false, the number " + "of frames depends only on the frame-shift, " + "and we reflect the data at the ends." + ), + ) parser.add_argument( - '--center', + "--center", default=False, type=str2bool, - help=('If true, puts the center of the frame at t*frame_shift, ' - 'it over-wrides snip-edges and set it to false')) + help=( + "If true, puts the center of the frame at t*frame_shift, " + "it over-wrides snip-edges and set it to false" + ), + ) parser.add_argument( - '--energy-floor', + "--energy-floor", type=float, default=0, - help='Floor on energy (absolute, not relative) in MFCC computation' + help="Floor on energy (absolute, not relative) in MFCC computation", ) parser.add_argument( - '--raw-energy', + "--raw-energy", default=True, type=str2bool, - help='If true, compute energy before preemphasis and windowing') - parser.add_argument('--use-energy', - default=True, - type=str2bool, - help='Use energy (not C0) in MFCC computation') + help="If true, compute energy before preemphasis and windowing", + ) + parser.add_argument( + "--use-energy", + default=True, + type=str2bool, + help="Use energy (not C0) in MFCC computation", + ) - parser.add_argument('--cepstral-lifter', - type=float, - default=22, - help='Constant that controls scaling of MFCCs') + parser.add_argument( + "--cepstral-lifter", + type=float, + default=22, + help="Constant that controls scaling of MFCCs", + ) parser.add_argument( - '--audio-feat', - default='cepstrum', + "--audio-feat", + default="cepstrum", choices=FEAT_TYPES, - help=('It can return intermediate result: fft, spec, log_spec, ' - 'logfb, mfcc')) + help=( + "It can return intermediate result: fft, spec, log_spec, " "logfb, mfcc" + ), + ) if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='acoustic features options') add_argparse_args = add_class_args diff --git a/hyperion/torch/layers/calibrators.py b/hyperion/torch/layers/calibrators.py index 18d69e4b..4b38a858 100644 --- a/hyperion/torch/layers/calibrators.py +++ b/hyperion/torch/layers/calibrators.py @@ -6,12 +6,12 @@ import torch import torch.nn as nn -class LinBinCalibrator(nn.Module): +class LinBinCalibrator(nn.Module): def __init__(self, a, b): super().__init__() self.a = a self.b = b def forward(self, x): - return self.a*x+self.b + return self.a * x + self.b diff --git a/hyperion/torch/layers/margin_losses.py b/hyperion/torch/layers/margin_losses.py index c6098f04..795172a6 100644 --- 
a/hyperion/torch/layers/margin_losses.py +++ b/hyperion/torch/layers/margin_losses.py @@ -11,6 +11,7 @@ import torch.nn as nn import torch.cuda.amp as amp + def _l2_norm(x, axis=-1): with amp.autocast(enabled=False): norm = torch.norm(x.float(), 2, axis, True) + 1e-10 @@ -19,7 +20,6 @@ def _l2_norm(x, axis=-1): class ArcLossOutput(nn.Module): - def __init__(self, in_feats, num_classes, s=64, margin=0.3, margin_warmup_epochs=0): super().__init__() self.in_feats = in_feats @@ -31,37 +31,38 @@ def __init__(self, in_feats, num_classes, s=64, margin=0.3, margin_warmup_epochs self.cur_margin = margin else: self.cur_margin = 0 - + self._compute_aux() self.kernel = nn.Parameter(torch.Tensor(in_feats, num_classes)) - self.kernel.data.uniform_(-1, 1).renorm_(2,1,1e-5).mul_(1e5) - + self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5) def __repr__(self): return self.__str__() def __str__(self): - s = '%s(in_feats=%d, num_classes=%d, s=%.2f, margin=%.2f, margin_warmup_epochs=%d)' % ( + s = "%s(in_feats=%d, num_classes=%d, s=%.2f, margin=%.2f, margin_warmup_epochs=%d)" % ( self.__class__.__name__, - self.in_feats, self.num_classes, - self.s, self.margin, self.margin_warmup_epochs) + self.in_feats, + self.num_classes, + self.s, + self.margin, + self.margin_warmup_epochs, + ) return s - def _compute_aux(self): - logging.info('updating arc-softmax margin=%.2f' % (self.cur_margin)) + logging.info("updating arc-softmax margin=%.2f" % (self.cur_margin)) self.cos_m = math.cos(self.cur_margin) self.sin_m = math.sin(self.cur_margin) - def update_margin(self, epoch): - + if self.margin_warmup_epochs == 0: return if epoch < self.margin_warmup_epochs: - self.cur_margin = self.margin*epoch/self.margin_warmup_epochs + self.cur_margin = self.margin * epoch / self.margin_warmup_epochs else: if self.cur_margin != self.margin: self.cur_margin = self.margin @@ -70,33 +71,32 @@ def update_margin(self, epoch): self._compute_aux() - def forward(self, x, y=None): with amp.autocast(enabled=False): s = self.s batch_size = len(x) x = _l2_norm(x.float()) kernel_norm = _l2_norm(self.kernel, axis=0) - # cos(theta+m) + # cos(theta+m) cos_theta = torch.mm(x, kernel_norm).float() - cos_theta = cos_theta.clamp(-1,1) # for numerical stability - #print(cos_theta) - output = cos_theta * 1.0 # a little bit hacky way to prevent in_place operation on cos_theta + cos_theta = cos_theta.clamp(-1, 1) # for numerical stability + # print(cos_theta) + output = ( + cos_theta * 1.0 + ) # a little bit hacky way to prevent in_place operation on cos_theta if y is not None and self.training: cos_theta_2 = torch.pow(cos_theta, 2) sin_theta_2 = (1 + 1e-10) - cos_theta_2 sin_theta = torch.sqrt(sin_theta_2) - cos_theta_m = (cos_theta * self.cos_m - sin_theta * self.sin_m) - + cos_theta_m = cos_theta * self.cos_m - sin_theta * self.sin_m + idx_ = torch.arange(0, batch_size, dtype=torch.long) output[idx_, y] = cos_theta_m[idx_, y] - output *= s # scale up in order to make softmax work + output *= s # scale up in order to make softmax work return output - - # @amp.float_function # def forward(self, x, y=None): @@ -109,7 +109,7 @@ def forward(self, x, y=None): # logging.info('xn={}'.format(str(x[9]))) # batch_size = len(x) # kernel_norm = _l2_norm(self.kernel, axis=0) - # # cos(theta+m) + # # cos(theta+m) # cos_theta = torch.mm(x, kernel_norm).float() # cos_theta = cos_theta.clamp(-1,1) # for numerical stability # #print(cos_theta) @@ -135,9 +135,7 @@ def forward(self, x, y=None): # return output - class CosLossOutput(nn.Module): - def __init__(self, 
in_feats, num_classes, s=64, margin=0.3, margin_warmup_epochs=0): super().__init__() self.in_feats = in_feats @@ -149,86 +147,101 @@ def __init__(self, in_feats, num_classes, s=64, margin=0.3, margin_warmup_epochs self.cur_margin = margin else: self.cur_margin = 0 - + self.kernel = nn.Parameter(torch.Tensor(in_feats, num_classes)) - self.kernel.data.uniform_(-1, 1).renorm_(2,1,1e-5).mul_(1e5) - + self.kernel.data.uniform_(-1, 1).renorm_(2, 1, 1e-5).mul_(1e5) def update_margin(self, epoch): - + if self.margin_warmup_epochs == 0: return if epoch < self.margin_warmup_epochs: - self.cur_margin = self.margin*epoch/self.margin_warmup_epochs - logging.info('updating cos-softmax margin=%.2f' % (self.cur_margin)) + self.cur_margin = self.margin * epoch / self.margin_warmup_epochs + logging.info("updating cos-softmax margin=%.2f" % (self.cur_margin)) else: if self.cur_margin != self.margin: self.cur_margin = self.margin - logging.info('updating cos-softmax margin=%.2f' % (self.cur_margin)) + logging.info("updating cos-softmax margin=%.2f" % (self.cur_margin)) else: return - def forward(self, x, y=None): with amp.autocast(enabled=False): s = self.s x = _l2_norm(x.float()) batch_size = len(x) - kernel_norm = _l2_norm(self.kernel,axis=0) - # cos(theta+m) + kernel_norm = _l2_norm(self.kernel, axis=0) + # cos(theta+m) cos_theta = torch.mm(x, kernel_norm).float() - cos_theta = cos_theta.clamp(-1,1) # for numerical stability + cos_theta = cos_theta.clamp(-1, 1) # for numerical stability - output = cos_theta * 1.0 # a little bit hacky way to prevent in_place operation on cos_theta + output = ( + cos_theta * 1.0 + ) # a little bit hacky way to prevent in_place operation on cos_theta if y is not None and self.training: cos_theta_m = cos_theta - self.cur_margin idx_ = torch.arange(0, batch_size, dtype=torch.long) output[idx_, y] = cos_theta_m[idx_, y] - output *= s # scale up in order to make softmax work + output *= s # scale up in order to make softmax work return output class SubCenterArcLossOutput(ArcLossOutput): - - def __init__(self, in_feats, num_classes, num_subcenters=2, s=64, margin=0.3, margin_warmup_epochs=0): - super().__init__(in_feats, num_classes * num_subcenters, s, margin, margin_warmup_epochs) + def __init__( + self, + in_feats, + num_classes, + num_subcenters=2, + s=64, + margin=0.3, + margin_warmup_epochs=0, + ): + super().__init__( + in_feats, num_classes * num_subcenters, s, margin, margin_warmup_epochs + ) self.num_classes = num_classes self.num_subcenters = num_subcenters - def __str__(self): - s = '%s(in_feats=%d, num_classes=%d, num_subcenters=%d, s=%.2f, margin=%.2f, margin_warmup_epochs=%d)' % ( + s = "%s(in_feats=%d, num_classes=%d, num_subcenters=%d, s=%.2f, margin=%.2f, margin_warmup_epochs=%d)" % ( self.__class__.__name__, - self.in_feats, self.num_classes, self.num_subcenters, - self.s, self.margin, self.margin_warmup_epochs) + self.in_feats, + self.num_classes, + self.num_subcenters, + self.s, + self.margin, + self.margin_warmup_epochs, + ) return s - def forward(self, x, y=None): with amp.autocast(enabled=False): s = self.s batch_size = len(x) x = _l2_norm(x.float()) kernel_norm = _l2_norm(self.kernel, axis=0) - # cos(theta+m) + # cos(theta+m) cos_theta = torch.mm(x, kernel_norm).float() cos_theta = torch.max( - cos_theta.view(-1, self.num_classes, self.num_subcenters), dim=-1)[0] - - cos_theta = cos_theta.clamp(-1, 1) # for numerical stability - #print(cos_theta) - output = cos_theta * 1.0 # a little bit hacky way to prevent in_place operation on cos_theta + 
cos_theta.view(-1, self.num_classes, self.num_subcenters), dim=-1 + )[0] + + cos_theta = cos_theta.clamp(-1, 1) # for numerical stability + # print(cos_theta) + output = ( + cos_theta * 1.0 + ) # a little bit hacky way to prevent in_place operation on cos_theta if y is not None and self.training: cos_theta_2 = torch.pow(cos_theta, 2) sin_theta_2 = (1 + 1e-10) - cos_theta_2 sin_theta = torch.sqrt(sin_theta_2) - cos_theta_m = (cos_theta * self.cos_m - sin_theta * self.sin_m) - + cos_theta_m = cos_theta * self.cos_m - sin_theta * self.sin_m + idx_ = torch.arange(0, batch_size, dtype=torch.long) output[idx_, y] = cos_theta_m[idx_, y] - output *= s # scale up in order to make softmax work + output *= s # scale up in order to make softmax work return output diff --git a/hyperion/torch/layers/mvn.py b/hyperion/torch/layers/mvn.py index e5bedf23..3ee1e121 100644 --- a/hyperion/torch/layers/mvn.py +++ b/hyperion/torch/layers/mvn.py @@ -9,12 +9,9 @@ class MeanVarianceNorm(nn.Module): - def __init__(self, - norm_mean=True, - norm_var=False, - left_context=0, - right_context=0, - dim=1): + def __init__( + self, norm_mean=True, norm_var=False, left_context=0, right_context=0, dim=1 + ): super(MeanVarianceNorm, self).__init__() self.norm_mean = norm_mean @@ -27,16 +24,22 @@ def __repr__(self): return self.__str__() def __str__(self): - s = '{}(norm_mean={}, norm_var={}, left_context={}, right_context={}, dim={})'.format( - self.__class__.__name__, self.norm_mean, self.norm_var, - self.left_context, self.right_context, self.dim) + s = "{}(norm_mean={}, norm_var={}, left_context={}, right_context={}, dim={})".format( + self.__class__.__name__, + self.norm_mean, + self.norm_var, + self.left_context, + self.right_context, + self.dim, + ) return s def forward(self, x): T = x.shape[self.dim] - if (self.left_context == 0 and self.right_context - == 0) or (T <= self.left_context + self.right_context + 1): + if (self.left_context == 0 and self.right_context == 0) or ( + T <= self.left_context + self.right_context + 1 + ): return self.normalize_global(x) return self.normalize_cumsum(x) @@ -56,8 +59,8 @@ def normalize_global(self, x): def normalize_cumsum(self, x): if self.norm_mean: - #substract first global mean - #it will help cumsum numerical stability + # substract first global mean + # it will help cumsum numerical stability m_x = torch.mean(x, dim=self.dim, keepdim=True) x = x - m_x @@ -66,25 +69,27 @@ def normalize_cumsum(self, x): total_context = self.left_context + self.right_context + 1 - xx = nn.functional.pad(x.transpose(1, -1), - (self.left_context, self.right_context), - mode='reflect').transpose(1, -1) + xx = nn.functional.pad( + x.transpose(1, -1), (self.left_context, self.right_context), mode="reflect" + ).transpose(1, -1) if self.norm_mean: c_x = torch.cumsum(xx, dim=1) - m_x = (c_x[:, total_context - 1:] - - c_x[:, :-total_context + 1]) / total_context + m_x = ( + c_x[:, total_context - 1 :] - c_x[:, : -total_context + 1] + ) / total_context if self.norm_var: - c_x = torch.cumsum(xx**2, dim=1) - m_x2 = (c_x[:, total_context - 1:] - - c_x[:, :-total_context + 1]) / total_context + c_x = torch.cumsum(xx ** 2, dim=1) + m_x2 = ( + c_x[:, total_context - 1 :] - c_x[:, : -total_context + 1] + ) / total_context if self.norm_mean: x = x - m_x if self.norm_var: - s_x = torch.sqrt((m_x2 - m_x**2).clamp(min=1e-5)) + s_x = torch.sqrt((m_x2 - m_x ** 2).clamp(min=1e-5)) x = x / s_x if self.dim != 1: @@ -95,72 +100,88 @@ def normalize_cumsum(self, x): @staticmethod def filter_args(**kwargs): """Filters 
ST-CMVN args from arguments dictionary. - - Args: - kwargs: Arguments dictionary. - - Returns: - Dictionary with ST-CMVN options. + + Args: + kwargs: Arguments dictionary. + + Returns: + Dictionary with ST-CMVN options. """ - valid_args = ('no_norm_mean', 'norm_mean', 'norm_var', 'left_context', - 'right_context', 'context') + valid_args = ( + "no_norm_mean", + "norm_mean", + "norm_var", + "left_context", + "right_context", + "context", + ) d = dict((k, kwargs[k]) for k in valid_args if k in kwargs) - if 'no_norm_mean' in d: - d['norm_mean'] = not d['no_norm_mean'] - del d['no_norm_mean'] + if "no_norm_mean" in d: + d["norm_mean"] = not d["no_norm_mean"] + del d["no_norm_mean"] - if 'context' in d: - if d['context'] is not None: - d['left_context'] = d['context'] - d['right_context'] = d['context'] - del d['context'] + if "context" in d: + if d["context"] is not None: + d["left_context"] = d["context"] + d["right_context"] = d["context"] + del d["context"] return d @staticmethod def add_class_args(parser, prefix=None): """Adds ST-CMVN options to parser. - - Args: - parser: Arguments parser - prefix: Options prefix. + + Args: + parser: Arguments parser + prefix: Options prefix. """ if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") - parser.add_argument('--no-norm-mean', - default=False, - action='store_true', - help='don\'t center the features') + parser.add_argument( + "--no-norm-mean", + default=False, + action="store_true", + help="don't center the features", + ) - parser.add_argument('--norm-var', - default=False, - action='store_true', - help='normalize the variance of the features') + parser.add_argument( + "--norm-var", + default=False, + action="store_true", + help="normalize the variance of the features", + ) - parser.add_argument('--left-context', - type=int, - default=150, - help='past context in number of frames') + parser.add_argument( + "--left-context", + type=int, + default=150, + help="past context in number of frames", + ) - parser.add_argument('--right-context', - type=int, - default=150, - help='future context in number of frames') + parser.add_argument( + "--right-context", + type=int, + default=150, + help="future context in number of frames", + ) parser.add_argument( - '--context', + "--context", type=int, default=None, - help=('past/future context in number of frames, ' - 'overwrites left-context and right-context options')) + help=( + "past/future context in number of frames, " + "overwrites left-context and right-context options" + ), + ) if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='mean-var norm. 
options') add_argparse_args = add_class_args diff --git a/hyperion/torch/layers/norm_layer_factory.py b/hyperion/torch/layers/norm_layer_factory.py index b47e173a..cd7e542f 100644 --- a/hyperion/torch/layers/norm_layer_factory.py +++ b/hyperion/torch/layers/norm_layer_factory.py @@ -7,14 +7,12 @@ class NormLayer2dFactory(object): - - @staticmethod def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): - """ Creates a layer-norm callabe constructor + """Creates a layer-norm callabe constructor Args: - norm_name: str with normalization layer name, + norm_name: str with normalization layer name, in [batch-norm, group-norm, instance-norm, instance-norm-affine, layer-norm ] num_groups: num_groups for group-norm @@ -24,47 +22,44 @@ def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): Returns: Callable contructor to crate layer-norm layers """ - + # if None we assume batch-norm - if norm_name is None or norm_name == 'batch-norm': + if norm_name is None or norm_name == "batch-norm": return lambda x, momentum=momentum, eps=eps: nn.BatchNorm2d( - x, momentum=momentum, eps=eps) + x, momentum=momentum, eps=eps + ) if not isinstance(norm_name, str): # we assume that this is already a layernorm object # and return unchanged return norm_name - - if norm_name == 'group-norm': + if norm_name == "group-norm": num_groups = 32 if num_groups is None else num_groups return lambda x, momentum=momentum, eps=eps: nn.GroupNorm( - num_groups, x, eps=eps) + num_groups, x, eps=eps + ) - if norm_name == 'instance-norm': - return lambda x, momentum=momentum, eps=eps: nn.InstanceNorm2d( - x, eps=eps) + if norm_name == "instance-norm": + return lambda x, momentum=momentum, eps=eps: nn.InstanceNorm2d(x, eps=eps) - if norm_name == 'instance-norm-affine': + if norm_name == "instance-norm-affine": return lambda x, momentum=momentum, eps=eps: nn.InstanceNorm2d( - x, eps=eps, affine=True) + x, eps=eps, affine=True + ) - if norm_name == 'layer-norm': + if norm_name == "layer-norm": # it is equivalent to groupnorm with 1 group - return lambda x, momentum=momentum, eps=eps: nn.GroupNorm( - 1, x, eps=eps) - + return lambda x, momentum=momentum, eps=eps: nn.GroupNorm(1, x, eps=eps) class NormLayer1dFactory(object): - - @staticmethod def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): - """ Creates a layer-norm callabe constructor + """Creates a layer-norm callabe constructor Args: - norm_name: str with normalization layer name, + norm_name: str with normalization layer name, in [batch-norm, group-norm, instance-norm, instance-norm-affine, layer-norm ] num_groups: num_groups for group-norm @@ -74,32 +69,32 @@ def create(norm_name, num_groups=None, momentum=0.1, eps=1e-5): Returns: Callable contructor to crate layer-norm layers """ - + # if None we assume batch-norm - if norm_name is None or norm_name == 'batch-norm': + if norm_name is None or norm_name == "batch-norm": return lambda x, momentum=momentum, eps=eps: nn.BatchNorm1d( - x, momentum=momentum, eps=eps) + x, momentum=momentum, eps=eps + ) if not isinstance(norm_name, str): # we assume that this is already a layernorm object # and return unchanged return norm_name - if norm_name == 'group-norm': + if norm_name == "group-norm": num_groups = 32 if num_groups is None else num_groups return lambda x, momentum=momentum, eps=eps: nn.GroupNorm( - num_groups, x, eps=eps) + num_groups, x, eps=eps + ) - if norm_name == 'instance-norm': - return lambda x, momentum=momentum, eps=eps: nn.InstanceNorm1d( - x, eps=eps) + if norm_name == "instance-norm": + 
return lambda x, momentum=momentum, eps=eps: nn.InstanceNorm1d(x, eps=eps) - if norm_name == 'instance-norm-affine': + if norm_name == "instance-norm-affine": return lambda x, momentum=momentum, eps=eps: nn.InstanceNorm1d( - x, eps=eps, affine=True) + x, eps=eps, affine=True + ) - if norm_name == 'layer-norm': + if norm_name == "layer-norm": # it is equivalent to groupnorm with 1 group - return lambda x, momentum=momentum, eps=eps: nn.GroupNorm( - 1, x, eps=eps) - + return lambda x, momentum=momentum, eps=eps: nn.GroupNorm(1, x, eps=eps) diff --git a/hyperion/torch/layers/pdf_storage.py b/hyperion/torch/layers/pdf_storage.py index 72731457..bac48d27 100644 --- a/hyperion/torch/layers/pdf_storage.py +++ b/hyperion/torch/layers/pdf_storage.py @@ -10,15 +10,14 @@ class StdNormal(nn.Module): - """Storage for Standard Normal distribution - """ + """Storage for Standard Normal distribution""" + def __init__(self, shape): super().__init__() - self.register_buffer('loc', torch.zeros(shape)) - self.register_buffer('scale', torch.ones(shape)) - #self.loc = nn.Parameter(torch.zeros(shape), requires_grad=False) - #self.scale = nn.Parameter(torch.ones(shape), requires_grad=False) - + self.register_buffer("loc", torch.zeros(shape)) + self.register_buffer("scale", torch.ones(shape)) + # self.loc = nn.Parameter(torch.zeros(shape), requires_grad=False) + # self.scale = nn.Parameter(torch.ones(shape), requires_grad=False) @property def pdf(self): @@ -26,4 +25,3 @@ def pdf(self): def forward(self): return self.pdf - diff --git a/hyperion/torch/layers/pos_encoder.py b/hyperion/torch/layers/pos_encoder.py index 1d587d75..f3aa17e9 100644 --- a/hyperion/torch/layers/pos_encoder.py +++ b/hyperion/torch/layers/pos_encoder.py @@ -25,17 +25,15 @@ def __init__(self, num_feats, dropout_rate=0): self.dropout = torch.nn.Dropout(p=dropout_rate) self.pe = None - def __repr__(self): return self.__str__() - def __str__(self): - s = '{}(num_feats={}, dropout_rate={})'.format( - self.__class__.__name__, self.num_feats, self.dropout_rate) + s = "{}(num_feats={}, dropout_rate={})".format( + self.__class__.__name__, self.num_feats, self.dropout_rate + ) return s - def _pe(self, x, relative=False): """Reset the positional encodings.""" if self.pe is not None: @@ -47,18 +45,21 @@ def _pe(self, x, relative=False): pe = torch.zeros(x.size(1), self.num_feats) if relative: # this is for relative positional encoders - position = torch.arange(x.size(1)-1, -1, -1, dtype=torch.float32).unsqueeze(1) + position = torch.arange( + x.size(1) - 1, -1, -1, dtype=torch.float32 + ).unsqueeze(1) else: position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1) - div_term = torch.exp(torch.arange(0, self.num_feats, 2, dtype=torch.float32) * - -(math.log(10000.0) / self.num_feats)) + div_term = torch.exp( + torch.arange(0, self.num_feats, 2, dtype=torch.float32) + * -(math.log(10000.0) / self.num_feats) + ) pe[:, 0::2] = torch.sin(position * div_term) pe[:, 1::2] = torch.cos(position * div_term) pe = pe.unsqueeze(0) self.pe = pe.to(device=x.device, dtype=x.dtype) return self.pe - def forward(self, x): """Add positional encoding. 
@@ -69,13 +70,12 @@ def forward(self, x): x-scaled + pos-encoder """ pe = self._pe(x) - x = x * self.xscale + pe[:,:x.size(1)] + x = x * self.xscale + pe[:, : x.size(1)] if self.dropout_rate > 0: return self.dropout(x) return x - class RelPosEncoder(PosEncoder): """Relative Positional encoding as defined in https://arxiv.org/pdf/1901.02860.pdf @@ -87,10 +87,10 @@ class RelPosEncoder(PosEncoder): num_feats: embedding dim dropout_rate: dropout rate """ + def __init__(self, num_feats, dropout_rate=0): super().__init__(num_feats, dropout_rate) - def forward(self, x): """Add positional encoding. @@ -105,8 +105,8 @@ def forward(self, x): x = x * self.xscale # we want embedding [R_L,..., R_0] # while in non relative we want [R_0, ..., R_L] - pos_emb = self.pe[:,-x.size(1):] - # this pos_emb is matrix Q in + pos_emb = self.pe[:, -x.size(1) :] + # this pos_emb is matrix Q in # https://arxiv.org/pdf/1901.02860.pdf Appendix B # I think it should have been denoted as R, # probably a typo in the paper @@ -119,13 +119,13 @@ def forward(self, x): class NoPosEncoder(nn.Module): """This is a dummy class for the case where we - deactivate the positional encoder + deactivate the positional encoder """ + def __init__(self): super().__init__() - def forward(self, x): """Identity map @@ -136,4 +136,3 @@ def forward(self, x): x """ return x - diff --git a/hyperion/torch/layers/subpixel_convs.py b/hyperion/torch/layers/subpixel_convs.py index 87ee16d5..6b529aff 100644 --- a/hyperion/torch/layers/subpixel_convs.py +++ b/hyperion/torch/layers/subpixel_convs.py @@ -7,15 +7,33 @@ import torch import torch.nn as nn -class SubPixelConv1d(nn.Module): - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, - dilation=1, groups=1, bias=True, padding_mode='zeros'): +class SubPixelConv1d(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + ): super().__init__() - self.conv = nn.Conv1d(in_channels, stride*out_channels, kernel_size, stride=1, - padding=padding, dilation=dilation, - groups=groups, bias=bias, padding_mode=padding_mode) - + self.conv = nn.Conv1d( + in_channels, + stride * out_channels, + kernel_size, + stride=1, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode, + ) + self.out_channels = out_channels self.stride = stride @@ -24,26 +42,44 @@ def forward(self, x): if self.stride == 1: return x - x = x.view(-1, self.stride, self.out_channels, x.size(-1)).permute( - 0,2,3,1).reshape(-1, self.out_channels, x.size(-1)*self.stride) + x = ( + x.view(-1, self.stride, self.out_channels, x.size(-1)) + .permute(0, 2, 3, 1) + .reshape(-1, self.out_channels, x.size(-1) * self.stride) + ) return x - class SubPixelConv2d(nn.Module): - - def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, - dilation=1, groups=1, bias=True, padding_mode='zeros'): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=True, + padding_mode="zeros", + ): super().__init__() - self.conv = nn.Conv2d(in_channels, (stride**2)*out_channels, kernel_size, stride=1, - padding=padding, dilation=dilation, - groups=groups, bias=bias, padding_mode=padding_mode) - + self.conv = nn.Conv2d( + in_channels, + (stride ** 2) * out_channels, + kernel_size, + stride=1, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + padding_mode=padding_mode, + ) + 
self.stride = stride if stride > 1: self.pixel_shuffle = nn.PixelShuffle(self.stride) - def forward(self, x): x = self.conv(x) if self.stride == 1: @@ -52,11 +88,10 @@ def forward(self, x): return self.pixel_shuffle(x) - def ICNR2d(tensor, stride=2, initializer=nn.init.kaiming_normal): - """Initialization method + """Initialization method "Initialization to Convolution Nearest neighbours Resize (ICNR)" - for subpixel convolutions described in + for subpixel convolutions described in described in "Andrew Aitken et al. (2017) Checkerboard artifact free sub-pixel convolution" https://arxiv.org/abs/1707.02937 @@ -74,21 +109,19 @@ def ICNR2d(tensor, stride=2, initializer=nn.init.kaiming_normal): subkernel = torch.zeros(new_shape) subkernel = initializer(subkernel) subkernel = subkernel.transpose(0, 1).contiguous() - subkernel = subkernel.view( - subkernel.shape[0], subkernel.shape[1], -1) - + subkernel = subkernel.view(subkernel.shape[0], subkernel.shape[1], -1) + kernel = subkernel.repeat(1, 1, stride ** 2) - + transposed_shape = [tensor.shape[1], tensor.shape[0]] + list(tensor.shape[2:]) - kernel = kernel.contiguous().view(transposed_shape).transpose( - 0, 1).contiguous() + kernel = kernel.contiguous().view(transposed_shape).transpose(0, 1).contiguous() tensor.copy_(kernel) def ICNR1d(tensor, stride=2, initializer=nn.init.kaiming_normal): - """1d version of the initialization method + """1d version of the initialization method "Initialization to Convolution Nearest neighbours Resize (ICNR)" - for subpixel convolutions described in + for subpixel convolutions described in described in "Andrew Aitken et al. (2017) Checkerboard artifact free sub-pixel convolution" https://arxiv.org/abs/1707.02937 @@ -106,13 +139,10 @@ def ICNR1d(tensor, stride=2, initializer=nn.init.kaiming_normal): subkernel = torch.zeros(new_shape) subkernel = initializer(subkernel) subkernel = subkernel.transpose(0, 1).contiguous() - subkernel = subkernel.view( - subkernel.shape[0], subkernel.shape[1], -1) + subkernel = subkernel.view(subkernel.shape[0], subkernel.shape[1], -1) kernel = subkernel.repeat(1, 1, stride) transposed_shape = (tensor.shape[1], tensor.shape[0], tensor.shape[2]) - kernel = kernel.contiguous().view(transposed_shape).transpose( - 0, 1).contiguous() + kernel = kernel.contiguous().view(transposed_shape).transpose(0, 1).contiguous() tensor.copy_(kernel) - diff --git a/hyperion/torch/layers/swish.py b/hyperion/torch/layers/swish.py index 70779ca1..520a71fb 100644 --- a/hyperion/torch/layers/swish.py +++ b/hyperion/torch/layers/swish.py @@ -5,6 +5,7 @@ import torch import torch.nn as nn + class SwishImplementation(torch.autograd.Function): @staticmethod def forward(ctx, i): @@ -27,6 +28,5 @@ def __repr__(self): return self.__str__() def __str__(self): - s = '{}()'.format(self.__class__.__name__) + s = "{}()".format(self.__class__.__name__) return s - diff --git a/hyperion/torch/layers/tensor2pdf.py b/hyperion/torch/layers/tensor2pdf.py index 925b5a0a..e38b1bc7 100644 --- a/hyperion/torch/layers/tensor2pdf.py +++ b/hyperion/torch/layers/tensor2pdf.py @@ -9,24 +9,27 @@ import torch.nn.functional as nnf import torch.distributions as pdf + class Tensor2PDF(nn.Module): """Base class for layers that create a prob distribution - from an input tensor + from an input tensor """ + def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): super().__init__() self.pdf_feats = pdf_feats self.project = project if project: - assert in_feats is not None, ( - 'input channels must be given to make the 
projection') - assert in_dim is not None, ( - 'input tensor dim must be given to make the projection') + assert ( + in_feats is not None + ), "input channels must be given to make the projection" + assert ( + in_dim is not None + ), "input tensor dim must be given to make the projection" self.in_feats = in_feats self.in_dim = in_dim - def _make_proj(self, in_feats, out_feats, ndims): if ndims == 2: return nn.Linear(in_feats, out_feats) @@ -37,23 +40,17 @@ def _make_proj(self, in_feats, out_feats, ndims): elif ndims == 5: return nn.Conv3d(in_feats, out_feats, kernel_size=1) else: - raise ValueError('ndim=%d is not supported' % ndims) - - + raise ValueError("ndim=%d is not supported" % ndims) class Tensor2NormalICov(Tensor2PDF): - """Transforms a Tensor into Normal distribution with identitiy variance - - """ + """Transforms a Tensor into Normal distribution with identitiy variance""" + def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): - super().__init__( - pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) + super().__init__(pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) if self.project: - self._proj = self._make_proj( - self.in_feats, self.pdf_feats, self.in_dim) - + self._proj = self._make_proj(self.in_feats, self.pdf_feats, self.in_dim) def forward(self, inputs, prior=None, squeeze_dim=None): if self.project: @@ -68,38 +65,34 @@ def forward(self, inputs, prior=None, squeeze_dim=None): return pdf.normal.Normal(loc, scale) - class Tensor2NormalGlobDiagCov(Tensor2PDF): """Transforms a Tensor into Normal distribution - - Input tensor will be the mean of the distribution and - the standard deviation is a global trainable parameter. + + Input tensor will be the mean of the distribution and + the standard deviation is a global trainable parameter. """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): - super().__init__( - pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) + super().__init__(pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) if self.project: - self._proj = self._make_proj( - self.in_feats, self.pdf_feats, self.in_dim) - - pdf_shape = [1]*self.in_dim + self._proj = self._make_proj(self.in_feats, self.pdf_feats, self.in_dim) + + pdf_shape = [1] * self.in_dim pdf_shape[1] = pdf_feats pdf_shape = tuple(pdf_shape) self.logvar = nn.Parameter(torch.zeros(pdf_shape)) - def forward(self, inputs, prior=None, squeeze_dim=None): if self.project: inputs = self._proj(inputs) # stddev loc = inputs - scale = torch.exp(0.5*self.logvar) + scale = torch.exp(0.5 * self.logvar) if prior is not None: - # we force the variance of the posterior smaller than + # we force the variance of the posterior smaller than # the variance of the prior scale = torch.min(scale, prior.scale) @@ -110,32 +103,28 @@ def forward(self, inputs, prior=None, squeeze_dim=None): return pdf.normal.Normal(loc, scale) - class Tensor2NormalDiagCov(Tensor2PDF): """Transforms a Tensor into Normal distribution - - Applies two linear transformation to the tensors to - obtain the mean and the log-variance. + + Applies two linear transformation to the tensors to + obtain the mean and the log-variance. 
""" def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): - super().__init__( - pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) + super().__init__(pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) if self.project: - self._proj = self._make_proj( - self.in_feats, self.pdf_feats*2, self.in_dim) - + self._proj = self._make_proj(self.in_feats, self.pdf_feats * 2, self.in_dim) def forward(self, inputs, prior=None, squeeze_dim=None): if self.project: inputs = self._proj(inputs) - + loc, logvar = inputs.chunk(2, dim=1) - scale = torch.exp(0.5*logvar) + scale = torch.exp(0.5 * logvar) if prior is not None: - # we force the variance of the posterior smaller than + # we force the variance of the posterior smaller than # the variance of the prior scale = torch.min(scale, prior.scale) @@ -149,20 +138,18 @@ def forward(self, inputs, prior=None, squeeze_dim=None): class Tensor2BayNormalICovGivenNormalPrior(Tensor2PDF): """Transforms a Tensor into Normal distribution with identitiy variance - Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation + Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation """ + def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): - super().__init__( - pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) + super().__init__(pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) if self.project: - self._proj = self._make_proj( - self.in_feats, self.pdf_feats, self.in_dim) + self._proj = self._make_proj(self.in_feats, self.pdf_feats, self.in_dim) - #interpolation factors between prior and ML estimation + # interpolation factors between prior and ML estimation self._alpha = nn.Parameter(torch.zeros(1)) - def forward(self, inputs, prior=None, squeeze_dim=None): if self.project: inputs = self._proj(inputs) @@ -171,7 +158,7 @@ def forward(self, inputs, prior=None, squeeze_dim=None): scale = torch.ones_like(inputs) if prior is not None: alpha = nnf.sigmoid(self._alpha) - loc = alpha * loc + (1 - alpha)*prior.loc + loc = alpha * loc + (1 - alpha) * prior.loc if squeeze_dim is not None: loc = loc.squeeze(dim=squeeze_dim) @@ -180,42 +167,38 @@ def forward(self, inputs, prior=None, squeeze_dim=None): return pdf.normal.Normal(loc, scale) - class Tensor2BayNormalGlobDiagCovGivenNormalPrior(Tensor2PDF): """Transforms a Tensor into Normal distribution - - Input tensor will be the ML mean of the distribution and - the ML standard deviation is a global trainable parameter. - Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation + Input tensor will be the ML mean of the distribution and + the ML standard deviation is a global trainable parameter. 
+ + Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): - super().__init__( - pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) + super().__init__(pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) if self.project: - self._proj = self._make_proj( - self.in_feats, self.pdf_feats, self.in_dim) - - pdf_shape = [1]*self.in_dim + self._proj = self._make_proj(self.in_feats, self.pdf_feats, self.in_dim) + + pdf_shape = [1] * self.in_dim pdf_shape[1] = pdf_feats pdf_shape = tuple(pdf_shape) self.logvar = nn.Parameter(torch.zeros(pdf_shape)) - #interpolation factors between prior and ML estimation + # interpolation factors between prior and ML estimation self._alpha = nn.Parameter(torch.zeros(1)) self._beta = nn.Parameter(torch.zeros(1)) - def forward(self, inputs, prior=None, squeeze_dim=None): if self.project: inputs = self._proj(inputs) # stddev loc = inputs - scale = torch.exp(0.5*self.logvar) + scale = torch.exp(0.5 * self.logvar) if prior is not None: # MAP estimation of Gaussian mean and var @@ -227,8 +210,12 @@ def forward(self, inputs, prior=None, squeeze_dim=None): alpha = nnf.sigmoid(self._alpha) beta = nnf.sigmoid(self._beta) delta_loc = loc - prior.loc - loc = alpha * loc + (1 - alpha)*prior.loc - var = beta * scale**2 + (1 - beta)*prior.scale**2 + beta*(1 - alpha)*delta_loc**2 + loc = alpha * loc + (1 - alpha) * prior.loc + var = ( + beta * scale ** 2 + + (1 - beta) * prior.scale ** 2 + + beta * (1 - alpha) * delta_loc ** 2 + ) scale = torch.sqrt(var) if squeeze_dim is not None: @@ -238,35 +225,31 @@ def forward(self, inputs, prior=None, squeeze_dim=None): return pdf.normal.Normal(inputs, scale) - class Tensor2BayNormalDiagCovGivenNormalPrior(Tensor2PDF): """Transforms a Tensor into Normal distribution - - Applies two linear transformation to the tensors to - obtain the maximum likelihood mean and the log-variance. - Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation + Applies two linear transformation to the tensors to + obtain the maximum likelihood mean and the log-variance. + + Uses Bayesian interpolation between Gaussian prior and Maximum Likelihood estimation """ def __init__(self, pdf_feats, project=True, in_feats=None, in_dim=None): - super().__init__( - pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) + super().__init__(pdf_feats, project=project, in_feats=in_feats, in_dim=in_dim) if self.project: - self._proj = self._make_proj( - self.in_feats, self.pdf_feats*2, self.in_dim) + self._proj = self._make_proj(self.in_feats, self.pdf_feats * 2, self.in_dim) - #interpolation factors between prior and ML estimation + # interpolation factors between prior and ML estimation self._alpha = nn.Parameter(torch.zeros(1)) self._beta = nn.Parameter(torch.zeros(1)) - def forward(self, inputs, prior=None, squeeze_dim=None): if self.project: inputs = self._proj(inputs) - + loc, logvar = inputs.chunk(2, dim=1) - scale = torch.exp(0.5*logvar) + scale = torch.exp(0.5 * logvar) if prior is not None: # MAP estimation of Gaussian mean and var # Eq. 
from Bishop2006 (10.60-10.63) @@ -278,15 +261,15 @@ def forward(self, inputs, prior=None, squeeze_dim=None): beta = nnf.sigmoid(self._beta) delta_loc = loc - prior.loc loc = alpha * loc + (1 - alpha) * prior.loc - var = beta * scale**2 + (1 - beta) * prior.scale**2 + beta * (1 - alpha)*delta_loc**2 + var = ( + beta * scale ** 2 + + (1 - beta) * prior.scale ** 2 + + beta * (1 - alpha) * delta_loc ** 2 + ) scale = torch.sqrt(var) - if squeeze_dim is not None: loc = loc.squeeze(dim=squeeze_dim) scale = scale.squeeze(dim=squeeze_dim) return pdf.normal.Normal(loc, scale) - - - diff --git a/hyperion/torch/layers/tensor2pdf1.py b/hyperion/torch/layers/tensor2pdf1.py index 05a2a56c..87ba3475 100644 --- a/hyperion/torch/layers/tensor2pdf1.py +++ b/hyperion/torch/layers/tensor2pdf1.py @@ -7,21 +7,20 @@ import torch.nn as nn import torch.distributions as pdf + class Tensor2PDF(nn.Module): """Base class for layers that create a prob distribution - from an input tensor + from an input tensor """ + def __init__(self): super(Tensor2PDF, self).__init__() self.tensor2pdfparam_factor = 1 - - class Tensor2NormalICov(Tensor2PDF): - """Transforms a Tensor into Normal distribution with identitiy variance - - """ + """Transforms a Tensor into Normal distribution with identitiy variance""" + def __init__(self): super(Tensor2NormalGlobDiagCov, self).__init__() @@ -30,12 +29,11 @@ def forward(self, loc, prior=None): return pdf.normal.Normal(loc, scale) - class Tensor2NormalGlobDiagCov(Tensor2PDF): """Transforms a Tensor into Normal distribution - - Input tensor will be the mean of the distribution and - the standard deviation is a global trainable parameter. + + Input tensor will be the mean of the distribution and + the standard deviation is a global trainable parameter. """ def __init__(self, shape): @@ -44,9 +42,9 @@ def __init__(self, shape): def forward(self, loc, prior=None): # stddev - scale = torch.exp(0.5*self.logvar) + scale = torch.exp(0.5 * self.logvar) if prior is not None: - # the variance of the posterior should be smaller than + # the variance of the posterior should be smaller than # the variance of the prior scale = torch.min(scale, prior.scale) @@ -55,34 +53,32 @@ def forward(self, loc, prior=None): class Tensor2NormalDiagCov(Tensor2PDF): """Transforms a Tensor into Normal distribution - - Applies two linear transformation to the tensors to - obtain the mean and the log-variance. + + Applies two linear transformation to the tensors to + obtain the mean and the log-variance. """ def __init__(self): super(Tensor2NormalDiagCov, self).__init__() self.tensor2pdfparam_factor = 2 - def forward(self, x, prior=None): # stddev loc, logvar = x.chunk(2, dim=1) logvar = self.logvar(x) - scale = torch.exp(0.5*logvar) + scale = torch.exp(0.5 * logvar) if prior is not None: - # the variance of the posterior should be smaller than + # the variance of the posterior should be smaller than # the variance of the prior scale = torch.min(scale, prior.scale) return pdf.normal.Normal(loc, scale) - # class Tensor2NormalDiagCovLin(Tensor2PDF): # """Transforms a Tensor into Normal distribution - -# Applies two linear transformation to the tensors to + +# Applies two linear transformation to the tensors to # obtain the mean and the log-variance. 
# """ @@ -104,7 +100,6 @@ def forward(self, x, prior=None): # self.logvar = nn.Conv3d(in_shape[-1], out_shape[-1], kernel_size=1) # else: # raise ValueError('ndim=%d is not supported' % ndim) - # def forward(self, x, prior=None): @@ -113,10 +108,8 @@ def forward(self, x, prior=None): # logvar = self.logvar(x) # scale = torch.exp(0.5*logvar) # if prior is not None: -# # the variance of the posterior should be smaller than +# # the variance of the posterior should be smaller than # # the variance of the prior # scale = torch.min(scale, prior.scale) # return pdf.normal.Normal(loc, scale) - - diff --git a/hyperion/torch/layers/vq.py b/hyperion/torch/layers/vq.py index aafff7d2..98307438 100644 --- a/hyperion/torch/layers/vq.py +++ b/hyperion/torch/layers/vq.py @@ -9,36 +9,39 @@ import torch.nn.functional as F import torch.distributed as dist -class VectorQuantizer(nn.Module): - def __init__(self, num_embed, embed_feats, project=True, in_feats=None, in_dim=None): +class VectorQuantizer(nn.Module): + def __init__( + self, num_embed, embed_feats, project=True, in_feats=None, in_dim=None + ): super().__init__() self.num_embed = num_embed self.embed_feats = embed_feats self.project = project self._proj = None if project: - assert in_feats is not None, ( - 'input channels must be given to make the projection') - assert in_dim is not None, ( - 'input tensor dim must be given to make the projection') + assert ( + in_feats is not None + ), "input channels must be given to make the projection" + assert ( + in_dim is not None + ), "input tensor dim must be given to make the projection" self._proj = self._make_proj(in_feats, embed_feats, in_dim) elif in_feats is not None: assert in_feats == embed_feats, ( - 'in_feats (%d) != embed_feats (%), which is required when project=False' % ( - in_feats, embed_feats)) + "in_feats (%d) != embed_feats (%), which is required when project=False" + % (in_feats, embed_feats) + ) else: in_feats = embed_feats - + self.in_feats = in_feats self.in_dim = in_dim - def __repr__(self): return self.__str__() - def _make_proj(self, in_feats, out_feats, ndims): if ndims == 2: return nn.Linear(in_feats, out_feats) @@ -49,56 +52,68 @@ def _make_proj(self, in_feats, out_feats, ndims): elif ndims == 5: return nn.Conv3d(in_feats, out_feats, kernel_size=1) else: - raise ValueError('ndim=%d is not supported' % ndims) - - + raise ValueError("ndim=%d is not supported" % ndims) class KMeansVectorQuantizer(VectorQuantizer): - def __init__(self, num_embed, embed_feats, commitment_cost=0.25, - project=True, in_feats=None, in_dim=None): + def __init__( + self, + num_embed, + embed_feats, + commitment_cost=0.25, + project=True, + in_feats=None, + in_dim=None, + ): super().__init__( - num_embed, embed_feats, - project=project, in_feats=in_feats, in_dim=in_dim) + num_embed, embed_feats, project=project, in_feats=in_feats, in_dim=in_dim + ) self.commitment_cost = commitment_cost - + self.embed = nn.Parameter(torch.empty(num_embed, embed_feats)) # this how it is init in DeepMind code: # self.embed.weight.data.uniform_(-math.sqrt(3)/math.sqrt(num_embed), math.sqrt(3)/math.sqrt(num_embed)) # or equivalently: # nn.init.kaiming_uniform_(self.embed.weight, mode='fan_in', nonlinearity='linear') # normal seems to give a little better result, but not much, still we need to explore the best init. - nn.init.normal_(self.embed, std=1.) 
+ nn.init.normal_(self.embed, std=1.0) self._log_num_embed = math.log(num_embed) - def __str__(self): - s = ('{}(num_embed={}, embed_feats={}, commitment_cost={}, project={}, ' - 'in_feats={}, in_dim={})').format( - self.__class__.__name__, - self.num_embed, self.embed_feats, self.commitment_cost, - self.project, self.in_feats, self.in_dim) + s = ( + "{}(num_embed={}, embed_feats={}, commitment_cost={}, project={}, " + "in_feats={}, in_dim={})" + ).format( + self.__class__.__name__, + self.num_embed, + self.embed_feats, + self.commitment_cost, + self.project, + self.in_feats, + self.in_dim, + ) return s - def forward(self, inputs, return_r=False): # inputs -> z_e in paper if self.project: inputs = self._proj(inputs) # convert inputs from BCHW -> BHWC - inputs = inputs.transpose(1,-1).contiguous() + inputs = inputs.transpose(1, -1).contiguous() input_shape = inputs.shape - + # Flatten input flat_inputs = inputs.view(-1, self.embed_feats) - + # Calculate distances - d2 = (torch.sum(flat_inputs**2, dim=1, keepdim=True) - + torch.sum(self.embed**2, dim=1) - - 2 * torch.matmul(flat_inputs, self.embed.t())) - + d2 = ( + torch.sum(flat_inputs ** 2, dim=1, keepdim=True) + + torch.sum(self.embed ** 2, dim=1) + - 2 * torch.matmul(flat_inputs, self.embed.t()) + ) + # Encoding # quantization integer indexes q_idx = torch.argmin(d2, dim=1).unsqueeze(1) @@ -112,9 +127,9 @@ def forward(self, inputs, return_r=False): commitment_loss = F.mse_loss(z_q.detach(), inputs) loss = vq_loss + self.commitment_cost * commitment_loss - #this allows to backprogate the gradients as if the output were equal to z_e - z_q = inputs + (z_q-inputs).detach() - + # this allows to backprogate the gradients as if the output were equal to z_e + z_q = inputs + (z_q - inputs).detach() + # compute the perplexity probs = torch.mean(r, dim=0) log_perplexity = -torch.sum(probs * torch.log(probs + 1e-10)) @@ -124,59 +139,79 @@ def forward(self, inputs, return_r=False): # KL is constant so it doesn't contribute to the training # but we keep it to get a better estimation of the ELBO # in the paper they don't use it - num_spatial_positions = r.size(0)/inputs.size(0) - kldiv_r = self._log_num_embed * num_spatial_positions * torch.ones( - (inputs.size(0),1), device=inputs.device) - + num_spatial_positions = r.size(0) / inputs.size(0) + kldiv_r = ( + self._log_num_embed + * num_spatial_positions + * torch.ones((inputs.size(0), 1), device=inputs.device) + ) + # convert quantized from BHWC -> BCHW - z_q = z_q.transpose(1,-1).contiguous() - output = {'z_q': z_q, - 'loss': loss, - 'kldiv_qrpr': kldiv_r, - 'log_perplexity': log_perplexity } + z_q = z_q.transpose(1, -1).contiguous() + output = { + "z_q": z_q, + "loss": loss, + "kldiv_qrpr": kldiv_r, + "log_perplexity": log_perplexity, + } if return_r: - output['r'] = r + output["r"] = r return output - class MultiKMeansVectorQuantizer(VectorQuantizer): - def __init__(self, num_groups, num_embed, embed_feats, commitment_cost=0.25, - project=True, in_feats=None, in_dim=None): + def __init__( + self, + num_groups, + num_embed, + embed_feats, + commitment_cost=0.25, + project=True, + in_feats=None, + in_dim=None, + ): super().__init__( - num_embed, embed_feats, - project=project, in_feats=in_feats, in_dim=in_dim) + num_embed, embed_feats, project=project, in_feats=in_feats, in_dim=in_dim + ) - assert embed_feats % num_groups == 0, ( - 'VQ latent channels (%d) must be multiple of num_groups (%d)' % ( - embed_feats, num_groups)) + assert ( + embed_feats % num_groups == 0 + ), "VQ latent channels (%d) 
must be multiple of num_groups (%d)" % ( + embed_feats, + num_groups, + ) self.num_groups = num_groups embed_feats_i = embed_feats // num_groups self.vq_layers = nn.ModuleList([]) for i in range(num_groups): vq_i = KMeansVectorQuantizer( - num_embed, embed_feats_i, commitment_cost, project=False) + num_embed, embed_feats_i, commitment_cost, project=False + ) self.vq_layers.append(vq_i) - @property def commitment_cost(self): return self.vq_layers[0].commitment_cost - def __str__(self): - s = ('{}(num_groups={}, num_embed={}, embed_feats={}, commitment_cost={}, project={}, ' - 'in_feats={}, in_dim={})').format( - self.__class__.__name__, - self.num_groups, - self.num_embed, self.embed_feats, self.commitment_cost, - self.project, self.in_feats, self.in_dim) + s = ( + "{}(num_groups={}, num_embed={}, embed_feats={}, commitment_cost={}, project={}, " + "in_feats={}, in_dim={})" + ).format( + self.__class__.__name__, + self.num_groups, + self.num_embed, + self.embed_feats, + self.commitment_cost, + self.project, + self.in_feats, + self.in_dim, + ) return s - def forward(self, inputs, return_r=False): if self.project: inputs = self._proj(inputs) @@ -186,16 +221,16 @@ def forward(self, inputs, return_r=False): r = [] for i in range(self.num_groups): output_i = self.vq_layers[i](inputs[i], return_r=return_r) - z_qi = output_i['z_q'] - loss_i = output_i['loss'] - kldiv_ri = output_i['kldiv_qrpr'] - H_i = output_i['log_perplexity'] + z_qi = output_i["z_q"] + loss_i = output_i["loss"] + kldiv_ri = output_i["kldiv_qrpr"] + H_i = output_i["log_perplexity"] z_q.append(z_qi) if return_r: - r.append(output_i['r']) + r.append(output_i["r"]) - if i==0: + if i == 0: loss = loss_i kldiv_r = kldiv_ri H = H_i @@ -206,71 +241,86 @@ def forward(self, inputs, return_r=False): z_q = torch.cat(tuple(z_q), dim=1) log_perplexity = H / self.num_groups - output = {'z_q': z_q, - 'loss': loss, - 'kldiv_qrpr': kldiv_r, - 'log_perplexity': log_perplexity } + output = { + "z_q": z_q, + "loss": loss, + "kldiv_qrpr": kldiv_r, + "log_perplexity": log_perplexity, + } if return_r: - output['r'] = r + output["r"] = r return output - - class EMAKMeansVectorQuantizer(VectorQuantizer): - - def __init__(self, num_embed, embed_feats, commitment_cost=0.25, gamma=0.99, eps=1e-5, - project=True, in_feats=None, in_dim=None): + def __init__( + self, + num_embed, + embed_feats, + commitment_cost=0.25, + gamma=0.99, + eps=1e-5, + project=True, + in_feats=None, + in_dim=None, + ): super().__init__( - num_embed, embed_feats, - project=project, in_feats=in_feats, in_dim=in_dim) + num_embed, embed_feats, project=project, in_feats=in_feats, in_dim=in_dim + ) self.num_embed = num_embed self.embed_feats = embed_feats self.commitment_cost = commitment_cost self.gamma = gamma self.eps = eps - - self.register_buffer('embed', torch.empty(num_embed, embed_feats)) - nn.init.normal_(self.embed, std=1.) - - self.register_buffer('_ema_N', torch.zeros(num_embed)) - self.register_buffer('_ema_z_acc', torch.empty(num_embed, embed_feats)) - nn.init.normal_(self._ema_z_acc, std=1.) 
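The _ema_N and _ema_z_acc buffers registered in this constructor hold running per-codeword counts and feature sums. Below is a minimal single-process sketch of how the training-mode forward pass updates them; the real code additionally all-reduces N and z_acc across DDP workers, and the function name and signature here are illustrative only.

# Sketch only: exponential-moving-average codebook update used by
# EMAKMeansVectorQuantizer.forward when self.training is True.
import torch

def ema_codebook_update(r, flat_inputs, embed, ema_N, ema_z_acc,
                        gamma=0.99, eps=1e-5):
    """r: (N, K) one-hot assignments; flat_inputs: (N, D); embed: (K, D)."""
    N = r.sum(dim=0)                              # per-codeword counts in the batch
    ema_N = gamma * ema_N + (1 - gamma) * N       # smoothed counts
    N_tot = ema_N.sum()
    # Laplace smoothing keeps rarely used codewords at a small nonzero count.
    ema_N = (ema_N + eps) / (N_tot + embed.size(0) * eps) * N_tot
    z_acc = r.t() @ flat_inputs                   # per-codeword feature sums
    ema_z_acc = gamma * ema_z_acc + (1 - gamma) * z_acc
    embed = ema_z_acc / ema_N.unsqueeze(1)        # new codewords = running means
    return embed, ema_N, ema_z_acc

With this update the codewords are learned as exponentially weighted means of the encodings assigned to them, which is why the loss in this class keeps only the commitment term and drops the separate vq_loss.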
- - self._log_num_embed = math.log(num_embed) + self.register_buffer("embed", torch.empty(num_embed, embed_feats)) + nn.init.normal_(self.embed, std=1.0) + self.register_buffer("_ema_N", torch.zeros(num_embed)) + self.register_buffer("_ema_z_acc", torch.empty(num_embed, embed_feats)) + nn.init.normal_(self._ema_z_acc, std=1.0) + + self._log_num_embed = math.log(num_embed) def __str__(self): - s = ('{}(num_embed={}, embed_feats={}, commitment_cost={}, ' - 'gamma={}, eps={} project={}, in_feats={}, in_dim={})').format( - self.__class__.__name__, - self.num_embed, self.embed_feats, self.commitment_cost, - self.gamma, self.eps, - self.project, self.in_feats, self.in_dim) + s = ( + "{}(num_embed={}, embed_feats={}, commitment_cost={}, " + "gamma={}, eps={} project={}, in_feats={}, in_dim={})" + ).format( + self.__class__.__name__, + self.num_embed, + self.embed_feats, + self.commitment_cost, + self.gamma, + self.eps, + self.project, + self.in_feats, + self.in_dim, + ) return s - def forward(self, inputs, return_r=False): # inputs -> z_e in paper if self.project: inputs = self._proj(inputs) # convert inputs from BCHW -> BHWC - inputs = inputs.transpose(1,-1).contiguous() + inputs = inputs.transpose(1, -1).contiguous() input_shape = inputs.shape - + # Flatten input flat_inputs = inputs.view(-1, self.embed_feats) - + # Calculate distances - d2 = (torch.sum(flat_inputs**2, dim=1, keepdim=True) - + torch.sum(self.embed**2, dim=1) - - 2 * torch.matmul(flat_inputs, self.embed.t())) - + d2 = ( + torch.sum(flat_inputs ** 2, dim=1, keepdim=True) + + torch.sum(self.embed ** 2, dim=1) + - 2 * torch.matmul(flat_inputs, self.embed.t()) + ) + # Encoding # quantization integer indexes q_idx = torch.argmin(d2, dim=1).unsqueeze(1) @@ -278,32 +328,36 @@ def forward(self, inputs, return_r=False): r = torch.zeros(q_idx.shape[0], self.num_embed, device=inputs.device) r.scatter_(1, q_idx, 1) z_q = torch.matmul(r, self.embed).view(input_shape) - + # Use Exponetial Moving Average (EMA) to update the embedding vectors if self.training: N = torch.sum(r, dim=0) # required to sync gpus in DDP dist.all_reduce(N, op=dist.ReduceOp.SUM) - + ema_N = self._ema_N * self.gamma + (1 - self.gamma) * N - + N_tot = torch.sum(ema_N) # Laplace smoothing - self._ema_N = ((ema_N + self.eps)/(N_tot + self.num_embed * self.eps) * N_tot).detach() - + self._ema_N = ( + (ema_N + self.eps) / (N_tot + self.num_embed * self.eps) * N_tot + ).detach() + z_acc = torch.matmul(r.t(), flat_inputs) # required to sync gpus in DDP dist.all_reduce(z_acc, op=dist.ReduceOp.SUM) - self._ema_z_acc = (self.gamma*self._ema_z_acc + (1 - self.gamma)*z_acc).detach() - self.embed = (self._ema_z_acc/self._ema_N.unsqueeze(1)).detach() + self._ema_z_acc = ( + self.gamma * self._ema_z_acc + (1 - self.gamma) * z_acc + ).detach() + self.embed = (self._ema_z_acc / self._ema_N.unsqueeze(1)).detach() # Loss commitment_loss = F.mse_loss(z_q.detach(), inputs) loss = self.commitment_cost * commitment_loss - #this allows to backprogate the gradients as if the output were equal to z_e - z_q = inputs + (z_q-inputs).detach() - + # this allows to backprogate the gradients as if the output were equal to z_e + z_q = inputs + (z_q - inputs).detach() + # compute the perplexity probs = torch.mean(r, dim=0) log_perplexity = -torch.sum(probs * torch.log(probs + 1e-10)) @@ -313,51 +367,65 @@ def forward(self, inputs, return_r=False): # KL is constant so it doesn't contribute to the training # but we keep it to get a better estimation of the ELBO # in the paper they don't use it - 
num_spatial_positions = r.size(0)/inputs.size(0) - kldiv_r = self._log_num_embed * num_spatial_positions * torch.ones( - (inputs.size(0),1), device=inputs.device) - + num_spatial_positions = r.size(0) / inputs.size(0) + kldiv_r = ( + self._log_num_embed + * num_spatial_positions + * torch.ones((inputs.size(0), 1), device=inputs.device) + ) + # convert quantized from BHWC -> BCHW - z_q = z_q.transpose(1,-1).contiguous() - output = {'z_q': z_q, - 'loss': loss, - 'kldiv_qrpr': kldiv_r, - 'log_perplexity': log_perplexity } + z_q = z_q.transpose(1, -1).contiguous() + output = { + "z_q": z_q, + "loss": loss, + "kldiv_qrpr": kldiv_r, + "log_perplexity": log_perplexity, + } if return_r: - output['r'] = r + output["r"] = r return output - - class MultiEMAKMeansVectorQuantizer(VectorQuantizer): - def __init__(self, num_groups, num_embed, embed_feats, commitment_cost=0.25, gamma=0.99, eps=1e-5, - project=True, in_feats=None, in_dim=None): + def __init__( + self, + num_groups, + num_embed, + embed_feats, + commitment_cost=0.25, + gamma=0.99, + eps=1e-5, + project=True, + in_feats=None, + in_dim=None, + ): super().__init__( - num_embed, embed_feats, - project=project, in_feats=in_feats, in_dim=in_dim) + num_embed, embed_feats, project=project, in_feats=in_feats, in_dim=in_dim + ) - assert embed_feats % embed_feats == 0, ( - 'VQ latent channels (%d) must be multiple of num_groups (%d)' % ( - embed_feats, num_groups)) + assert ( + embed_feats % embed_feats == 0 + ), "VQ latent channels (%d) must be multiple of num_groups (%d)" % ( + embed_feats, + num_groups, + ) self.num_groups = num_groups embed_feats_i = embed_feats // num_groups self.vq_layers = nn.ModuleList([]) for i in range(num_groups): vq_i = EMAKMeansVectorQuantizer( - num_embed, embed_feats_i, commitment_cost, - gamma, eps, project=False) + num_embed, embed_feats_i, commitment_cost, gamma, eps, project=False + ) self.vq_layers.append(vq_i) - @property def commitment_cost(self): return self.vq_layers[0].commitment_cost - @property def gamma(self): return self.vq_layers[0].gamma @@ -366,18 +434,24 @@ def gamma(self): def eps(self): return self.vq_layers[0].eps - def __str__(self): - s = ('{}(num_groups={}, num_embed={}, embed_feats={}, commitment_cost={}, ' - 'gamma={}, eps={} project={}, in_feats={}, in_dim={})').format( - self.__class__.__name__, - self.num_groups, - self.num_embed, self.embed_feats, self.commitment_cost, - self.gamma, self.eps, - self.project, self.in_feats, self.in_dim) + s = ( + "{}(num_groups={}, num_embed={}, embed_feats={}, commitment_cost={}, " + "gamma={}, eps={} project={}, in_feats={}, in_dim={})" + ).format( + self.__class__.__name__, + self.num_groups, + self.num_embed, + self.embed_feats, + self.commitment_cost, + self.gamma, + self.eps, + self.project, + self.in_feats, + self.in_dim, + ) return s - def forward(self, inputs, return_r=False): if self.project: inputs = self._proj(inputs) @@ -387,16 +461,16 @@ def forward(self, inputs, return_r=False): r = [] for i in range(self.num_groups): output_i = self.vq_layers[i](inputs[i]) - z_qi = output_i['z_q'] - loss_i = output_i['loss'] - kldiv_ri = output_i['kldiv_qrpr'] - H_i = output_i['log_perplexity'] + z_qi = output_i["z_q"] + loss_i = output_i["loss"] + kldiv_ri = output_i["kldiv_qrpr"] + H_i = output_i["log_perplexity"] z_q.append(z_qi) if return_r: - r.append(output_i['r']) + r.append(output_i["r"]) - if i==0: + if i == 0: loss = loss_i kldiv_r = kldiv_ri H = H_i @@ -407,13 +481,15 @@ def forward(self, inputs, return_r=False): z_q = torch.cat(tuple(z_q), dim=1) 
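The Multi* quantizers all follow the same pattern visible in this hunk: split the channel dimension into num_groups chunks, quantize each chunk with an independent codebook, then concatenate the quantized chunks and average the per-group losses. A hedged sketch of that wrapper, assuming group quantizers that return a dict with "z_q" and "loss" keys like the classes above (the class and helper names are illustrative):

# Sketch only: grouped ("product") quantization wrapper.
import torch
import torch.nn as nn

class GroupedQuantizerSketch(nn.Module):
    def __init__(self, quantizers):
        super().__init__()
        self.vq_layers = nn.ModuleList(quantizers)    # one quantizer per channel group

    def forward(self, x):
        chunks = x.chunk(len(self.vq_layers), dim=1)  # split along the channel dim
        z_q, losses = [], []
        for vq, xi in zip(self.vq_layers, chunks):
            out = vq(xi)
            z_q.append(out["z_q"])
            losses.append(out["loss"])
        return {
            "z_q": torch.cat(z_q, dim=1),             # same channel layout as the input
            "loss": torch.stack(losses).mean(),       # average over groups
        }

class _IdentityVQ(nn.Module):
    """Stand-in quantizer for the sketch; the library would use KMeansVectorQuantizer."""
    def forward(self, x):
        return {"z_q": x, "loss": x.new_zeros(())}

gq = GroupedQuantizerSketch([_IdentityVQ() for _ in range(4)])
out = gq(torch.randn(2, 256, 50))   # 256 channels split into 4 groups of 64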
loss /= self.num_groups - log_perplexity = H/self.num_groups - output = {'z_q': z_q, - 'loss': loss, - 'kldiv_qrpr': kldiv_r, - 'log_perplexity': log_perplexity } + log_perplexity = H / self.num_groups + output = { + "z_q": z_q, + "loss": loss, + "kldiv_qrpr": kldiv_r, + "log_perplexity": log_perplexity, + } if return_r: - output['r'] = r + output["r"] = r return output diff --git a/hyperion/torch/layers/vq1.py b/hyperion/torch/layers/vq1.py index 846b00c3..c134b1b6 100644 --- a/hyperion/torch/layers/vq1.py +++ b/hyperion/torch/layers/vq1.py @@ -16,32 +16,33 @@ def __init__(self, num_embed, embed_dim, commitment_cost=0.25): self.num_embed = num_embed self.embed_dim = embed_dim self.commitment_cost = commitment_cost - - #self.embed = nn.Embedding(num_embed, embed_dim) + + # self.embed = nn.Embedding(num_embed, embed_dim) self.embed = nn.Parameter(torch.empty(num_embed, embed_dim)) # this how it is init in DeepMind code: # self.embed.weight.data.uniform_(-math.sqrt(3)/math.sqrt(num_embed), math.sqrt(3)/math.sqrt(num_embed)) # or equivalently: # nn.init.kaiming_uniform_(self.embed.weight, mode='fan_in', nonlinearity='linear') # normal seems to give a little better result, but not much, still we need to explore the best init. - nn.init.normal_(self.embed, std=1.) + nn.init.normal_(self.embed, std=1.0) self._log_num_embed = math.log(num_embed) - def forward(self, inputs): # inputs -> z_e in paper # convert inputs from BCHW -> BHWC - inputs = inputs.transpose(1,-1).contiguous() + inputs = inputs.transpose(1, -1).contiguous() input_shape = inputs.shape - + # Flatten input flat_inputs = inputs.view(-1, self.embed_dim) - + # Calculate distances - d2 = (torch.sum(flat_inputs**2, dim=1, keepdim=True) - + torch.sum(self.embed**2, dim=1) - - 2 * torch.matmul(flat_inputs, self.embed.t())) - + d2 = ( + torch.sum(flat_inputs ** 2, dim=1, keepdim=True) + + torch.sum(self.embed ** 2, dim=1) + - 2 * torch.matmul(flat_inputs, self.embed.t()) + ) + # Encoding # quantization integer indexes q_idx = torch.argmin(d2, dim=1).unsqueeze(1) @@ -50,16 +51,16 @@ def forward(self, inputs): r.scatter_(1, q_idx, 1) z_q = torch.matmul(r, self.embed).view(input_shape) - #z_q = self.embed(q_idx).view(input_shape) - + # z_q = self.embed(q_idx).view(input_shape) + # Loss vq_loss = F.mse_loss(z_q, inputs.detach()) commitment_loss = F.mse_loss(z_q.detach(), inputs) loss = vq_loss + self.commitment_cost * commitment_loss - #this allows to backprogate the gradients as if the output were equal to z_e - z_q = inputs + (z_q-inputs).detach() - + # this allows to backprogate the gradients as if the output were equal to z_e + z_q = inputs + (z_q - inputs).detach() + # compute the perplexity probs = torch.mean(r, dim=0) perplexity = torch.exp(-torch.sum(probs * torch.log(probs + 1e-10))) @@ -69,29 +70,31 @@ def forward(self, inputs): # KL is constant so it doesn't contribute to the training # but we keep it to get a better estimation of the ELBO # in the paper they don't use it - num_spatial_positions = r.size(0)/inputs.size(0) - kldiv_r = self._log_num_embed * num_spatial_positions * torch.ones( - (inputs.size(0),1), device=inputs.device) - + num_spatial_positions = r.size(0) / inputs.size(0) + kldiv_r = ( + self._log_num_embed + * num_spatial_positions + * torch.ones((inputs.size(0), 1), device=inputs.device) + ) + # convert quantized from BHWC -> BCHW - z_q = z_q.transpose(1,-1).contiguous() + z_q = z_q.transpose(1, -1).contiguous() return z_q, loss, kldiv_r, perplexity - # def forward(self, inputs): # # inputs -> z_e in paper # # 
convert inputs from BCHW -> BHWC # inputs = inputs.transpose(1,-1).contiguous() # input_shape = inputs.shape - + # # Flatten input # flat_inputs = inputs.view(-1, self.embed_dim) - + # # Calculate distances - # d2 = (torch.sum(flat_inputs**2, dim=1, keepdim=True) + # d2 = (torch.sum(flat_inputs**2, dim=1, keepdim=True) # + torch.sum(self.embed.weight**2, dim=1) # - 2 * torch.matmul(flat_inputs, self.embed.weight.t())) - + # # Encoding # # quantization integer indexes # q_idx = torch.argmin(d2, dim=1).unsqueeze(1) @@ -101,7 +104,7 @@ def forward(self, inputs): # z_q = torch.matmul(r, self.embed.weight).view(input_shape) # #z_q = self.embed(q_idx).view(input_shape) - + # # Loss # vq_loss = F.mse_loss(z_q, inputs.detach()) # commitment_loss = F.mse_loss(z_q.detach(), inputs) @@ -109,7 +112,7 @@ def forward(self, inputs): # #this allows to backprogate the gradients as if the output were equal to z_e # z_q = inputs + (z_q-inputs).detach() - + # # compute the perplexity # probs = torch.mean(r, dim=0) # perplexity = torch.exp(-torch.sum(probs * torch.log(probs + 1e-10))) @@ -122,19 +125,21 @@ def forward(self, inputs): # num_spatial_positions = r.size(0)/inputs.size(0) # kldiv_r = self._log_num_embed * num_spatial_positions * torch.ones( # (inputs.size(0),1), device=inputs.device) - + # # convert quantized from BHWC -> BCHW # z_q = z_q.transpose(1,-1).contiguous() # return z_q, loss, kldiv_r, perplexity - class KMeansMultiVectorQuantizer(nn.Module): def __init__(self, num_groups, num_embed, embed_dim, commitment_cost=0.25): super().__init__() - assert embed_dim % embed_dim == 0, ( - 'VQ latent channels (%d) must be multiple of num_groups (%d)' % ( - embed_dim, num_groups)) + assert ( + embed_dim % embed_dim == 0 + ), "VQ latent channels (%d) must be multiple of num_groups (%d)" % ( + embed_dim, + num_groups, + ) self.num_groups = num_groups self.embed_dim = embed_dim @@ -144,24 +149,21 @@ def __init__(self, num_groups, num_embed, embed_dim, commitment_cost=0.25): vq_i = KMeansVectorQuantizer(num_embed, embed_dim_i, commitment_cost) self.vq_layers.append(vq_i) - @property def num_embed(self): return self.vq_layers[0].num_embed - @property def commitment_cost(self): return self.vq_layers[0].commitment_cost - def forward(self, inputs): inputs = inputs.chunk(self.num_groups, dim=1) z_q = [] for i in range(self.num_groups): z_qi, loss_i, kldiv_ri, p_i = self.vq_layers[i](inputs[i]) z_q.append(z_qi) - if i==0: + if i == 0: loss = loss_i kldiv_r = kldiv_ri perplexity = p_i @@ -175,13 +177,12 @@ def forward(self, inputs): perplexity /= self.num_groups return z_q, loss, kldiv_r, perplexity - - class EMAKMeansVectorQuantizer(nn.Module): - - def __init__(self, num_embed, embed_dim, commitment_cost=0.25, gamma=0.99, eps=1e-5): + def __init__( + self, num_embed, embed_dim, commitment_cost=0.25, gamma=0.99, eps=1e-5 + ): super().__init__() self.num_embed = num_embed @@ -189,35 +190,36 @@ def __init__(self, num_embed, embed_dim, commitment_cost=0.25, gamma=0.99, eps=1 self.commitment_cost = commitment_cost self.gamma = gamma self.eps = eps - - #self.embed = nn.Embedding(num_embed, embed_dim) - #self.embed.weight.data.normal_() - self.register_buffer('embed', torch.empty(num_embed, embed_dim)) - nn.init.normal_(self.embed, std=1.) - - self.register_buffer('_ema_N', torch.zeros(num_embed)) - self.register_buffer('_ema_z_acc', torch.empty(num_embed, embed_dim)) - nn.init.normal_(self._ema_z_acc, std=1.) 
- #self._ema_z_acc = nn.Parameter(torch.Tensor(num_embed, embed_dim)) - #self._ema_z_acc.data.normal_() - - self._log_num_embed = math.log(num_embed) + # self.embed = nn.Embedding(num_embed, embed_dim) + # self.embed.weight.data.normal_() + self.register_buffer("embed", torch.empty(num_embed, embed_dim)) + nn.init.normal_(self.embed, std=1.0) + + self.register_buffer("_ema_N", torch.zeros(num_embed)) + self.register_buffer("_ema_z_acc", torch.empty(num_embed, embed_dim)) + nn.init.normal_(self._ema_z_acc, std=1.0) + # self._ema_z_acc = nn.Parameter(torch.Tensor(num_embed, embed_dim)) + # self._ema_z_acc.data.normal_() + + self._log_num_embed = math.log(num_embed) def forward(self, inputs): # inputs -> z_e in paper # convert inputs from BCHW -> BHWC - inputs = inputs.transpose(1,-1).contiguous() + inputs = inputs.transpose(1, -1).contiguous() input_shape = inputs.shape - + # Flatten input flat_inputs = inputs.view(-1, self.embed_dim) - + # Calculate distances - d2 = (torch.sum(flat_inputs**2, dim=1, keepdim=True) - + torch.sum(self.embed**2, dim=1) - - 2 * torch.matmul(flat_inputs, self.embed.t())) - + d2 = ( + torch.sum(flat_inputs ** 2, dim=1, keepdim=True) + + torch.sum(self.embed ** 2, dim=1) + - 2 * torch.matmul(flat_inputs, self.embed.t()) + ) + # Encoding # quantization integer indexes q_idx = torch.argmin(d2, dim=1).unsqueeze(1) @@ -225,27 +227,31 @@ def forward(self, inputs): r = torch.zeros(q_idx.shape[0], self.num_embed, device=inputs.device) r.scatter_(1, q_idx, 1) z_q = torch.matmul(r, self.embed).view(input_shape) - + # Use Exponetial Moving Average (EMA) to update the embedding vectors if self.training: N = torch.sum(r, dim=0) ema_N = self._ema_N * self.gamma + (1 - self.gamma) * N - + N_tot = torch.sum(ema_N) # Laplace smoothing - self._ema_N = ((ema_N + self.eps)/(N_tot + self.num_embed * self.eps) * N_tot).detach() - + self._ema_N = ( + (ema_N + self.eps) / (N_tot + self.num_embed * self.eps) * N_tot + ).detach() + z_acc = torch.matmul(r.t(), flat_inputs) - self._ema_z_acc = (self.gamma*self._ema_z_acc + (1 - self.gamma)*z_acc).detach() - self.embed = (self._ema_z_acc/self._ema_N.unsqueeze(1)).detach() + self._ema_z_acc = ( + self.gamma * self._ema_z_acc + (1 - self.gamma) * z_acc + ).detach() + self.embed = (self._ema_z_acc / self._ema_N.unsqueeze(1)).detach() # Loss commitment_loss = F.mse_loss(z_q.detach(), inputs) loss = self.commitment_cost * commitment_loss - #this allows to backprogate the gradients as if the output were equal to z_e - z_q = inputs + (z_q-inputs).detach() - + # this allows to backprogate the gradients as if the output were equal to z_e + z_q = inputs + (z_q - inputs).detach() + # compute the perplexity probs = torch.mean(r, dim=0) perplexity = torch.exp(-torch.sum(probs * torch.log(probs + 1e-10))) @@ -255,30 +261,31 @@ def forward(self, inputs): # KL is constant so it doesn't contribute to the training # but we keep it to get a better estimation of the ELBO # in the paper they don't use it - num_spatial_positions = r.size(0)/inputs.size(0) - kldiv_r = self._log_num_embed * num_spatial_positions * torch.ones( - (inputs.size(0),1), device=inputs.device) - + num_spatial_positions = r.size(0) / inputs.size(0) + kldiv_r = ( + self._log_num_embed + * num_spatial_positions + * torch.ones((inputs.size(0), 1), device=inputs.device) + ) + # convert quantized from BHWC -> BCHW - z_q = z_q.transpose(1,-1).contiguous() + z_q = z_q.transpose(1, -1).contiguous() return z_q, loss, kldiv_r, perplexity - - # def forward(self, inputs): # # inputs -> z_e in paper # # 
convert inputs from BCHW -> BHWC # inputs = inputs.transpose(1,-1).contiguous() # input_shape = inputs.shape - + # # Flatten input # flat_inputs = inputs.view(-1, self.embed_dim) - + # # Calculate distances - # d2 = (torch.sum(flat_inputs**2, dim=1, keepdim=True) + # d2 = (torch.sum(flat_inputs**2, dim=1, keepdim=True) # + torch.sum(self.embed.weight**2, dim=1) # - 2 * torch.matmul(flat_inputs, self.embed.weight.t())) - + # # Encoding # # quantization integer indexes # q_idx = torch.argmin(d2, dim=1).unsqueeze(1) @@ -286,23 +293,23 @@ def forward(self, inputs): # r = torch.zeros(q_idx.shape[0], self.num_embed, device=inputs.device) # r.scatter_(1, q_idx, 1) # z_q = torch.matmul(r, self.embed.weight).view(input_shape) - + # # Use Exponetial Moving Average (EMA) to update the embedding vectors # if self.training: # N = torch.sum(r, dim=0) # self._ema_N = self._ema_N * self.gamma + (1 - self.gamma) * N - + # N_tot = torch.sum(self._ema_N.data) # # Laplace smoothing # self._ema_N = (self._ema_N + self.eps)/(N_tot + self.num_embed * self.eps) * N_tot - + # z_acc = torch.matmul(r.t(), flat_inputs) # self._ema_z_acc = nn.Parameter( - # self.gamma*self._ema_z_acc + (1 - self.gamma)*z_acc, + # self.gamma*self._ema_z_acc + (1 - self.gamma)*z_acc, # requires_grad=False) - + # self.embed.weight = nn.Parameter( - # self._ema_z_acc/self._ema_N.unsqueeze(1), + # self._ema_z_acc/self._ema_N.unsqueeze(1), # requires_grad=False) # # Loss @@ -311,7 +318,7 @@ def forward(self, inputs): # #this allows to backprogate the gradients as if the output were equal to z_e # z_q = inputs + (z_q-inputs).detach() - + # # compute the perplexity # probs = torch.mean(r, dim=0) # perplexity = torch.exp(-torch.sum(probs * torch.log(probs + 1e-10))) @@ -324,40 +331,48 @@ def forward(self, inputs): # num_spatial_positions = r.size(0)/inputs.size(0) # kldiv_r = self._log_num_embed * num_spatial_positions * torch.ones( # (inputs.size(0),1), device=inputs.device) - + # # convert quantized from BHWC -> BCHW # z_q = z_q.transpose(1,-1).contiguous() # return z_q, loss, kldiv_r, perplexity - - class MultiEMAKMeansVectorQuantizer(nn.Module): - def __init__(self, num_groups, num_embed, embed_dim, commitment_cost=0.25, gamma=0.99, eps=1e-5): + def __init__( + self, + num_groups, + num_embed, + embed_dim, + commitment_cost=0.25, + gamma=0.99, + eps=1e-5, + ): super().__init__() - assert embed_dim % embed_dim == 0, ( - 'VQ latent channels (%d) must be multiple of num_groups (%d)' % ( - embed_dim, num_groups)) + assert ( + embed_dim % embed_dim == 0 + ), "VQ latent channels (%d) must be multiple of num_groups (%d)" % ( + embed_dim, + num_groups, + ) self.num_groups = num_groups self.embed_dim = embed_dim embed_dim_i = embed_dim // num_groups self.vq_layers = nn.ModuleList([]) for i in range(num_groups): - vq_i = EMAKMeansVectorQuantizer(num_embed, embed_dim_i, commitment_cost, gamma, eps) + vq_i = EMAKMeansVectorQuantizer( + num_embed, embed_dim_i, commitment_cost, gamma, eps + ) self.vq_layers.append(vq_i) - @property def num_embed(self): return self.vq_layers[0].num_embed - @property def commitment_cost(self): return self.vq_layers[0].commitment_cost - @property def gamma(self): return self.vq_layers[0].gamma @@ -366,14 +381,13 @@ def gamma(self): def eps(self): return self.vq_layers[0].eps - def forward(self, inputs): inputs = inputs.chunk(self.num_groups, dim=1) z_q = [] for i in range(self.num_groups): z_qi, loss_i, kldiv_ri, p_i = self.vq_layers[i](inputs[i]) z_q.append(z_qi) - if i==0: + if i == 0: loss = loss_i kldiv_r = kldiv_ri 
perplexity = p_i diff --git a/hyperion/torch/loggers/csv_logger.py b/hyperion/torch/loggers/csv_logger.py index 7c2372ce..402ddcd5 100644 --- a/hyperion/torch/loggers/csv_logger.py +++ b/hyperion/torch/loggers/csv_logger.py @@ -20,7 +20,8 @@ class CSVLogger(Logger): sep: column separator for csv file append: False, overwrite existing file, True, appends. """ - def __init__(self, file_path, sep=',', append=False): + + def __init__(self, file_path, sep=",", append=False): super().__init__() self.file_path = file_path self.sep = sep @@ -29,56 +30,53 @@ def __init__(self, file_path, sep=',', append=False): self.csv_file = None self.append_header = True self.log_keys = None - - def on_train_begin(self, logs=None, **kwargs): if self.rank != 0: return - - file_dir = os.path.dirname(self.file_path) + + file_dir = os.path.dirname(self.file_path) if not os.path.exists(file_dir): os.makedirs(file_dir) - + if self.append: if os.path.exists(self.file_path): - with open(self.file_path, 'r') as f: - self.append_header = len(f.readline())==0 + with open(self.file_path, "r") as f: + self.append_header = len(f.readline()) == 0 if self.append_header: - self.csv_file = open(self.file_path, 'w') + self.csv_file = open(self.file_path, "w") else: - self.csv_file = open(self.file_path, 'a') + self.csv_file = open(self.file_path, "a") - def on_epoch_end(self, logs=None, **kwargs): - if self.rank !=0: + if self.rank != 0: return logs = logs or {} - + if self.log_keys is None: self.log_keys = list(logs.keys()) if not self.csv_writer: + class MyDialect(csv.excel): delimiter = self.sep - fieldnames = ['epoch'] + self.log_keys + + fieldnames = ["epoch"] + self.log_keys self.csv_writer = csv.DictWriter( - self.csv_file, fieldnames=fieldnames, dialect=MyDialect) + self.csv_file, fieldnames=fieldnames, dialect=MyDialect + ) if self.append_header: self.csv_writer.writeheader() - row = ODict([('epoch', self.cur_epoch+1)]) + row = ODict([("epoch", self.cur_epoch + 1)]) row.update((k, logs[k]) for k in self.log_keys) self.csv_writer.writerow(row) self.csv_file.flush() - def on_train_end(self, logs=None, **kwargs): if self.rank != 0: return self.csv_file.close() self.writer = None - - diff --git a/hyperion/torch/loggers/logger.py b/hyperion/torch/loggers/logger.py index 480b3375..46c1130d 100644 --- a/hyperion/torch/loggers/logger.py +++ b/hyperion/torch/loggers/logger.py @@ -9,10 +9,11 @@ class Logger(object): """Base class for logger objects - + Attributes: params: training params dictionary """ + def __init__(self): try: rank = dist.get_rank() @@ -22,43 +23,39 @@ def __init__(self): world_size = 1 self.cur_epoch = 0 self.cur_batch = 0 - self.params=None + self.params = None self.rank = rank self.world_size = world_size - def on_epoch_begin(self, epoch, logs, **kwargs): """At the start of an epoch - + Args: epoch: index of the epoch logs: dictionary of logs """ self.cur_epoch = epoch - - + def on_epoch_end(self, logs, **kwargs): """At the end of an epoch - + Args: logs: dictionary of logs """ pass - def on_batch_begin(self, batch, logs, **kwargs): """At the start of a batch - + Args: batch: batch index within the epoch logs: dictionary of logs """ - self.cur_batch = batch - + self.cur_batch = batch def on_batch_end(self, logs, **kwargs): """At the end of a batch - + Args: batch: batch index within the epoch logs: dictionary of logs @@ -67,7 +64,7 @@ def on_batch_end(self, logs, **kwargs): def on_train_begin(self, logs, **kwargs): """At the start of training - + Args: logs: dictionary of logs """ @@ -75,13 +72,9 @@ def 
on_train_begin(self, logs, **kwargs): def on_train_end(self, logs, **kwargs): """At the end of training - + Args: batch: batch index within the epoch logs: dictionary of logs """ pass - - - - diff --git a/hyperion/torch/loggers/logger_list.py b/hyperion/torch/loggers/logger_list.py index 938cc79a..20ae58ec 100644 --- a/hyperion/torch/loggers/logger_list.py +++ b/hyperion/torch/loggers/logger_list.py @@ -15,31 +15,28 @@ class LoggerList(object): Attributes: loggers: list of Logger objects """ + def __init__(self, loggers=None): self.loggers = loggers or [] - def append(self, logger): self.loggers.append(logger) - @property def tensorboard_logger(self): for l in self.loggers: if isinstance(l, TBL): return l - @property def tensorboard_writer(self): for l in self.loggers: if isinstance(l, TBL): return l.writer - def on_epoch_begin(self, epoch, logs=None, **kwargs): """At the start of an epoch - + Args: epoch: index of the epoch logs: dictionary of logs @@ -48,10 +45,9 @@ def on_epoch_begin(self, epoch, logs=None, **kwargs): for logger in self.loggers: logger.on_epoch_begin(epoch, logs, **kwargs) - def on_epoch_end(self, logs=None, **kwargs): """At the end of an epoch - + Args: epoch: index of the epoch logs: dictionary of logs @@ -60,10 +56,9 @@ def on_epoch_end(self, logs=None, **kwargs): for logger in self.loggers: logger.on_epoch_end(logs, **kwargs) - def on_batch_begin(self, batch, logs=None, **kwargs): """At the start of a batch - + Args: batch: batch index within the epoch logs: dictionary of logs @@ -72,10 +67,9 @@ def on_batch_begin(self, batch, logs=None, **kwargs): for logger in self.loggers: logger.on_batch_begin(batch, logs, **kwargs) - def on_batch_end(self, logs=None, **kwargs): """At the end of a batch - + Args: batch: batch index within the epoch logs: dictionary of logs @@ -84,10 +78,9 @@ def on_batch_end(self, logs=None, **kwargs): for logger in self.loggers: logger.on_batch_end(logs, **kwargs) - def on_train_begin(self, logs=None, **kwargs): """At the start of training - + Args: logs: dictionary of logs """ @@ -95,10 +88,9 @@ def on_train_begin(self, logs=None, **kwargs): for logger in self.loggers: logger.on_train_begin(logs, **kwargs) - def on_train_end(self, logs=None, **kwargs): """At the end of training - + Args: batch: batch index within the epoch logs: dictionary of logs @@ -107,8 +99,5 @@ def on_train_end(self, logs=None, **kwargs): for logger in self.loggers: logger.on_train_end(logs, **kwargs) - def __iter__(self): return iter(self.loggers) - - diff --git a/hyperion/torch/loggers/prog_logger.py b/hyperion/torch/loggers/prog_logger.py index fff70f33..26479197 100644 --- a/hyperion/torch/loggers/prog_logger.py +++ b/hyperion/torch/loggers/prog_logger.py @@ -19,11 +19,12 @@ class ProgLogger(Logger): metrics: list of metrics interval: number of batches between prints """ + def __init__(self, metrics=None, interval=10): super().__init__() self.metrics = None if metrics is None else set(metrics) - + self.interval = interval self.epochs = 0 self.batches = 0 @@ -32,94 +33,93 @@ def __init__(self, metrics=None, interval=10): self.cur_batch = 0 self.cur_sample = 0 self.t0 = 0 - def on_train_begin(self, logs=None, **kwargs): - self.epochs = kwargs['epochs'] - + self.epochs = kwargs["epochs"] def on_epoch_begin(self, epoch, logs=None, **kwargs): if self.rank != 0: - return + return self.cur_epoch = epoch - logging.info('epoch: %d/%d starts' % (epoch+1, self.epochs)) - if 'samples' in kwargs: - self.samples = kwargs['samples'] * self.world_size + logging.info("epoch: %d/%d 
starts" % (epoch + 1, self.epochs)) + if "samples" in kwargs: + self.samples = kwargs["samples"] * self.world_size else: self.samples = 0 - if 'batches' in kwargs: - self.batches = kwargs['batches'] + if "batches" in kwargs: + self.batches = kwargs["batches"] else: self.batches = 0 self.cur_batch = 0 self.cur_sample = 0 self.t0 = time.time() - def on_batch_begin(self, batch, logs=None, **kwargs): - self.cur_batch = batch - - + self.cur_batch = batch def on_batch_end(self, logs=None, **kwargs): if self.rank != 0: - return + return batch_size = 0 - if 'batch_size' in kwargs: - batch_size = kwargs['batch_size'] * self.world_size - self.cur_sample += batch_size + if "batch_size" in kwargs: + batch_size = kwargs["batch_size"] * self.world_size + self.cur_sample += batch_size self.cur_batch += 1 if (self.cur_batch % self.interval) == 0: - info = 'epoch: %d/%d ' % (self.cur_epoch+1, self.epochs) + info = "epoch: %d/%d " % (self.cur_epoch + 1, self.epochs) etime, eta = self.estimate_epoch_time() if eta == None: - info += ' et: %s' % (etime) + info += " et: %s" % (etime) else: - info += ' et: %s eta: %s' % (etime, eta) - + info += " et: %s eta: %s" % (etime, eta) + if self.batches > 0: - info += ' batches: %d/%d(%d%%)' % ( - self.cur_batch, self.batches, int(100*self.cur_batch/self.batches)) + info += " batches: %d/%d(%d%%)" % ( + self.cur_batch, + self.batches, + int(100 * self.cur_batch / self.batches), + ) else: - info += ' batches: %d' % (self.cur_batch) + info += " batches: %d" % (self.cur_batch) if self.cur_sample > 0: if self.samples > 0: - info += ' samples: %d/%d(%d%%)' % ( - self.cur_sample, self.samples, int(100*self.cur_sample/self.samples)) + info += " samples: %d/%d(%d%%)" % ( + self.cur_sample, + self.samples, + int(100 * self.cur_sample / self.samples), + ) else: - info += ' samples: %d' % (self.cur_sample) + info += " samples: %d" % (self.cur_sample) for k, v in logs.items(): if self.metrics is None or k in self.metrics: - info += ' %s: %.6f' % (k, v) - - logging.info(info) + info += " %s: %.6f" % (k, v) + logging.info(info) def on_epoch_end(self, logs=None, **kwargs): if self.rank != 0: - return + return - info = 'epoch: %d/%d ' % (self.cur_epoch+1, self.epochs) + info = "epoch: %d/%d " % (self.cur_epoch + 1, self.epochs) for k, v in logs.items(): if self.metrics is None or k in self.metrics: - info += ' %s: %.6f' % (k, v) - + info += " %s: %.6f" % (k, v) def estimate_epoch_time(self): t1 = time.time() et = t1 - self.t0 if self.batches > 0: - total_t = et/self.cur_batch * self.batches + total_t = et / self.cur_batch * self.batches elif self.samples > 0 and self: - total_t = et/self.cur_sample * self.samples + total_t = et / self.cur_sample * self.samples else: total_t = -1 @@ -127,25 +127,20 @@ def estimate_epoch_time(self): if total_t == -1: eta = None else: - eta = self.sec2str(total_t-et) + eta = self.sec2str(total_t - et) return etime, eta - @staticmethod def sec2str(t): t = time.gmtime(t) if t.tm_mday > 1: - st = '%d:%02d:%02d:%02d' % (t.tm_mday-1, t.tm_hour, t.tm_min, t.tm_sec) + st = "%d:%02d:%02d:%02d" % (t.tm_mday - 1, t.tm_hour, t.tm_min, t.tm_sec) elif t.tm_hour > 0: - st = '%d:%02d:%02d' % (t.tm_hour, t.tm_min, t.tm_sec) + st = "%d:%02d:%02d" % (t.tm_hour, t.tm_min, t.tm_sec) elif t.tm_min > 0: - st = '%d:%02d' % (t.tm_min, t.tm_sec) - else: - st = '%ds' % (t.tm_sec) - + st = "%d:%02d" % (t.tm_min, t.tm_sec) + else: + st = "%ds" % (t.tm_sec) + return st - - - - diff --git a/hyperion/torch/loggers/tensorboard_logger.py b/hyperion/torch/loggers/tensorboard_logger.py 
index ffcab652..314757d1 100644 --- a/hyperion/torch/loggers/tensorboard_logger.py +++ b/hyperion/torch/loggers/tensorboard_logger.py @@ -10,11 +10,12 @@ class TensorBoardLogger(Logger): """Logger that sends training progress to tensorboard - + Attributes: tb_path: tensorboard output directory - + """ + def __init__(self, tb_path, interval=10): super().__init__() self.tb_path = tb_path @@ -24,46 +25,41 @@ def __init__(self, tb_path, interval=10): self.cur_epoch = 0 self.cur_batch = 0 - def on_train_begin(self, logs=None, **kwargs): if self.rank != 0: return - - self.writer = SummaryWriter(self.tb_path) + self.writer = SummaryWriter(self.tb_path) def on_epoch_begin(self, epoch, logs=None, **kwargs): if self.rank != 0: - return + return self.cur_epoch = epoch - if 'batches' in kwargs: - self.batches = kwargs['batches'] + if "batches" in kwargs: + self.batches = kwargs["batches"] else: self.batches = 0 self.cur_batch = 0 - def on_batch_end(self, logs=None, **kwargs): if self.rank != 0: - return + return self.cur_batch += 1 if (self.cur_batch % self.interval) == 0: step = self.cur_epoch * self.batches + self.cur_batch - for k,v in logs.items(): - self.writer.add_scalar(k+'/global_steps', v, step) - + for k, v in logs.items(): + self.writer.add_scalar(k + "/global_steps", v, step) def on_epoch_end(self, logs=None, **kwargs): - if self.rank !=0: + if self.rank != 0: return - - for k,v in logs.items(): - k = re.sub(r'^(train|val)_(.*)$', r'\2/\1', k) - self.writer.add_scalar(k, v, self.cur_epoch+1) - + + for k, v in logs.items(): + k = re.sub(r"^(train|val)_(.*)$", r"\2/\1", k) + self.writer.add_scalar(k, v, self.cur_epoch + 1) def on_train_end(self, logs=None, **kwargs): if self.rank != 0: diff --git a/hyperion/torch/loggers/wandb_logger.py b/hyperion/torch/loggers/wandb_logger.py index a0801718..c864e9b1 100644 --- a/hyperion/torch/loggers/wandb_logger.py +++ b/hyperion/torch/loggers/wandb_logger.py @@ -4,6 +4,7 @@ """ import re import os + try: import wandb except: @@ -14,12 +15,15 @@ class WAndBLogger(Logger): """Logger that sends training progress to weights and biases (wandb) - + Attributes: tb_path: tensorboard output directory - + """ - def __init__(self, project=None, group=None, name=None, path=None, mode='online', interval=10): + + def __init__( + self, project=None, group=None, name=None, path=None, mode="online", interval=10 + ): super().__init__() self.project = project self.path = path @@ -31,62 +35,58 @@ def __init__(self, project=None, group=None, name=None, path=None, mode='online' self.cur_epoch = 0 self.cur_batch = 0 - def on_train_begin(self, logs=None, **kwargs): if self.rank != 0: return - + if self.path is not None: if not os.path.exists(self.path): os.makedirs(self.path) - wandb.init(project=self.project, - group=self.group, - name=self.name, - dir=self.path, - mode=self.mode, - reinit=True) - + wandb.init( + project=self.project, + group=self.group, + name=self.name, + dir=self.path, + mode=self.mode, + reinit=True, + ) def on_epoch_begin(self, epoch, logs=None, **kwargs): if self.rank != 0: - return + return self.cur_epoch = epoch - if 'batches' in kwargs: - self.batches = kwargs['batches'] + if "batches" in kwargs: + self.batches = kwargs["batches"] else: self.batches = 0 self.cur_batch = 0 - def on_batch_end(self, logs=None, **kwargs): if self.rank != 0: - return + return self.cur_batch += 1 if (self.cur_batch % self.interval) == 0: step = self.cur_epoch * self.batches + self.cur_batch - logs = { k+'/global_steps':v for k,v in logs.items() } - logs['batch'] = step + 
logs = {k + "/global_steps": v for k, v in logs.items()} + logs["batch"] = step wandb.log(logs) # for k,v in logs.items(): # self.writer.add_scalar(k+'/global_steps', v, step) - def on_epoch_end(self, logs=None, **kwargs): - if self.rank !=0: + if self.rank != 0: return - - logs = { re.sub(r'^(train|val)_(.*)$', r'\2/\1', k): v - for k, v in logs.items() } - logs['epoch'] = self.cur_epoch + 1 + + logs = {re.sub(r"^(train|val)_(.*)$", r"\2/\1", k): v for k, v in logs.items()} + logs["epoch"] = self.cur_epoch + 1 wandb.log(logs) # for k,v in logs.items(): # k = re.sub(r'^(train|val)_(.*)$', r'\2/\1', k) # self.writer.add_scalar(k, v, self.cur_epoch+1) - def on_train_end(self, logs=None, **kwargs): if self.rank != 0: diff --git a/hyperion/torch/losses/__init__.py b/hyperion/torch/losses/__init__.py index ded908bc..bf3ce279 100644 --- a/hyperion/torch/losses/__init__.py +++ b/hyperion/torch/losses/__init__.py @@ -4,4 +4,3 @@ """ from .bce_with_llr import BCEWithLLR - diff --git a/hyperion/torch/losses/bce_with_llr.py b/hyperion/torch/losses/bce_with_llr.py index 8b22fc86..552d703a 100644 --- a/hyperion/torch/losses/bce_with_llr.py +++ b/hyperion/torch/losses/bce_with_llr.py @@ -12,16 +12,14 @@ class BCEWithLLR(nn.Module): - def __init__(self, p_tar=0.5): super().__init__() self.p_tar = p_tar - self.logit_ptar = math.log(p_tar/(1-p_tar)) - + self.logit_ptar = math.log(p_tar / (1 - p_tar)) # def forward(self, x, y, is_selfsim=False, is_sim=False, y2=None): # # logging.info('{} {}'.format(x.shape[0], y.shape[0])) - # if is_selfsim or is_sim: + # if is_selfsim or is_sim: # assert x.dim() > 1 # # x is a full score matrix # # y contains the labels of the rows @@ -35,7 +33,7 @@ def __init__(self, p_tar=0.5): # y[y!=1] = 0 # if is_selfsim: # #if it is selfsim we only use the upper trianglaur - # mask=torch.triu(torch.ones_like(x, dtype=torch.bool), + # mask=torch.triu(torch.ones_like(x, dtype=torch.bool), # diagonal=1) # x = x[mask] # y = y[mask] @@ -55,19 +53,15 @@ def __init__(self, p_tar=0.5): # x, y, weight=weight, reduction='mean') # return loss - def forward(self, x, y): y = y.float() ntar = torch.mean(y, dim=0) - nnon = torch.mean(1-y, dim=0) + nnon = torch.mean(1 - y, dim=0) weight_tar = self.p_tar / ntar weight_non = (1 - self.p_tar) / nnon x = x + self.logit_ptar - weight = y * weight_tar + (1-y) * weight_non + weight = y * weight_tar + (1 - y) * weight_non loss = nnf.binary_cross_entropy_with_logits( - x, y, weight=weight, reduction='mean') + x, y, weight=weight, reduction="mean" + ) return loss - - - - diff --git a/hyperion/torch/lr_schedulers/__init__.py b/hyperion/torch/lr_schedulers/__init__.py index 496c7227..f0a3465e 100644 --- a/hyperion/torch/lr_schedulers/__init__.py +++ b/hyperion/torch/lr_schedulers/__init__.py @@ -4,8 +4,6 @@ """ - - from .lr_scheduler import LRScheduler from .red_lr_on_plateau import ReduceLROnPlateau from .exp_lr import ExponentialLR diff --git a/hyperion/torch/lr_schedulers/cos_lr.py b/hyperion/torch/lr_schedulers/cos_lr.py index e25fa5bf..6e36cf2a 100644 --- a/hyperion/torch/lr_schedulers/cos_lr.py +++ b/hyperion/torch/lr_schedulers/cos_lr.py @@ -4,7 +4,6 @@ """ - import math import logging @@ -12,6 +11,7 @@ from .lr_scheduler import LRScheduler + class CosineLR(LRScheduler): r"""Set the learning rate of each parameter group using a cosine annealing schedule, where :math:`\eta_{max}` is set to the initial lr and @@ -25,7 +25,7 @@ class CosineLR(LRScheduler): When epoch=-1, sets initial lr as lr. 
It has been proposed in - `SGDR: Stochastic Gradient Descent with Warm Restarts`_. + `SGDR: Stochastic Gradient Descent with Warm Restarts`_. Args: optimizer (Optimizer): Wrapped optimizer. @@ -37,12 +37,25 @@ class CosineLR(LRScheduler): https://arxiv.org/abs/1608.03983 """ - def __init__(self, optimizer, T, T_mul=1, min_lr=0, warmup_steps=0, - warm_restarts=False, gamma=1, last_restart=0, num_restarts = 0, - epoch=0, step=0, update_lr_on_opt_step=False): - - super(CosineLR, self).__init__(optimizer, min_lr, warmup_steps, - epoch, step, update_lr_on_opt_step) + def __init__( + self, + optimizer, + T, + T_mul=1, + min_lr=0, + warmup_steps=0, + warm_restarts=False, + gamma=1, + last_restart=0, + num_restarts=0, + epoch=0, + step=0, + update_lr_on_opt_step=False, + ): + + super(CosineLR, self).__init__( + optimizer, min_lr, warmup_steps, epoch, step, update_lr_on_opt_step + ) self.T = T self.T_mul = T_mul self.warm_restarts = warm_restarts @@ -50,18 +63,15 @@ def __init__(self, optimizer, T, T_mul=1, min_lr=0, warmup_steps=0, self.num_restarts = num_restarts self.gamma = gamma - def on_epoch_begin(self, epoch=None, epoch_updates=1, **kwargs): super(CosineLR, self).on_epoch_begin(epoch) if self.update_lr_on_opt_step: # T has to correspond to an integer number of epochs - T = int(math.ceil(self.T/epoch_updates)*epoch_updates) + T = int(math.ceil(self.T / epoch_updates) * epoch_updates) if self.T != T: - logging.info('readjusting cos_lr T %d -> %d' % (self.T, T)) + logging.info("readjusting cos_lr T %d -> %d" % (self.T, T)) self.T = T - - - + def get_lr(self, step): x = step - self.last_restart # if x >= self.T and self.update_lr_on_opt_step and self.warm_restarts: @@ -69,24 +79,26 @@ def get_lr(self, step): # if self.epoch == 0: # self.T = x + 1 # logging.info('readjusting cos_lr T to %d' % (self.T)) - #logging.info('cos-get-lr step=%d last=%d T=%d' % (step, self.last_restart, self.T)) + # logging.info('cos-get-lr step=%d last=%d T=%d' % (step, self.last_restart, self.T)) if x >= self.T: if self.warm_restarts: self.last_restart = step x = 0 self.T *= self.T_mul self.num_restarts += 1 - logging.info('cos_lr warm-restart=%d T=%d' % (self.num_restarts, self.T)) + logging.info( + "cos_lr warm-restart=%d T=%d" % (self.num_restarts, self.T) + ) else: return self.min_lrs alpha = self.gamma ** self.num_restarts - r = math.pi/self.T - - return [eta_min + (alpha*eta_max - eta_min) * - (1 + math.cos(r * x)) / 2 - for eta_max, eta_min in zip(self.base_lrs, self.min_lrs)] + r = math.pi / self.T + return [ + eta_min + (alpha * eta_max - eta_min) * (1 + math.cos(r * x)) / 2 + for eta_max, eta_min in zip(self.base_lrs, self.min_lrs) + ] # def epoch_end_step(self, metrics=None): # if self.epoch==0 and self.update_lr_on_opt_step and self.warm_restarts: @@ -96,14 +108,34 @@ def get_lr(self, step): class AdamCosineLR(CosineLR): - - def __init__(self, optimizer, T=1, T_mul=2, warmup_steps=0, - warm_restarts=False, gamma=1, last_restart=0, num_restarts = 0, - epoch=-1, step=-1, update_lr_on_opt_step=False): - super(AdamCosineLR, super).__init__(optimizer, T, T_mul, 0, warmup_steps, - warm_restarts, last_restart, num_restarts, gamma, - epoch, step, update_lr_on_opt_step) - + def __init__( + self, + optimizer, + T=1, + T_mul=2, + warmup_steps=0, + warm_restarts=False, + gamma=1, + last_restart=0, + num_restarts=0, + epoch=-1, + step=-1, + update_lr_on_opt_step=False, + ): + super(AdamCosineLR, super).__init__( + optimizer, + T, + T_mul, + 0, + warmup_steps, + warm_restarts, + last_restart, + num_restarts, + gamma, + 
epoch, + step, + update_lr_on_opt_step, + ) def get_lr(self, step): x = step - self.last_restart @@ -117,7 +149,8 @@ def get_lr(self, step): return self.min_lrs alpha = gamma ** self.num_restarts - r = math.pi/self.T - - return [alpha * base_lr * 0.5 * (1 + math.cos(r * x)) - for base_lr in self.base_lrs] + r = math.pi / self.T + + return [ + alpha * base_lr * 0.5 * (1 + math.cos(r * x)) for base_lr in self.base_lrs + ] diff --git a/hyperion/torch/lr_schedulers/exp_lr.py b/hyperion/torch/lr_schedulers/exp_lr.py index 140c48a4..cbe00a01 100644 --- a/hyperion/torch/lr_schedulers/exp_lr.py +++ b/hyperion/torch/lr_schedulers/exp_lr.py @@ -4,35 +4,42 @@ """ - import torch from .lr_scheduler import LRScheduler + class ExponentialLR(LRScheduler): - """Exponential learning rate scheduler. - """ - def __init__(self, optimizer, decay_rate, decay_steps, hold_steps, - min_lr=0, warmup_steps=0, - epoch=0, step=0, update_lr_on_opt_step=False): + """Exponential learning rate scheduler.""" + + def __init__( + self, + optimizer, + decay_rate, + decay_steps, + hold_steps, + min_lr=0, + warmup_steps=0, + epoch=0, + step=0, + update_lr_on_opt_step=False, + ): super(ExponentialLR, self).__init__( - optimizer, min_lr, warmup_steps, - epoch, step, update_lr_on_opt_step) - self.decay_rate = decay_rate + optimizer, min_lr, warmup_steps, epoch, step, update_lr_on_opt_step + ) + self.decay_rate = decay_rate self.decay_steps = decay_steps self.hold_steps = max(hold_steps, self.warmup_steps) - def get_lr(self, step): if step < self.hold_steps: return self.base_lrs x = step - self.hold_steps - return [max( - min_lr, - base_lr * self.decay_rate ** (x/self.decay_steps)) - for base_lr, min_lr in zip(self.base_lrs, self.min_lrs)] - + return [ + max(min_lr, base_lr * self.decay_rate ** (x / self.decay_steps)) + for base_lr, min_lr in zip(self.base_lrs, self.min_lrs) + ] def load_state_dict(self, state_dict): """Loads the schedulers state. @@ -42,6 +49,6 @@ def load_state_dict(self, state_dict): from a call to :meth:`state_dict`. 
""" # we only load step and epoch so we can change the scheduler params during training - self.step = state_dict['step'] - self.epoch = state_dict['epoch'] - #self.__dict__.update(state_dict) + self.step = state_dict["step"] + self.epoch = state_dict["epoch"] + # self.__dict__.update(state_dict) diff --git a/hyperion/torch/lr_schedulers/factory.py b/hyperion/torch/lr_schedulers/factory.py index 7d188a91..9e185a7c 100644 --- a/hyperion/torch/lr_schedulers/factory.py +++ b/hyperion/torch/lr_schedulers/factory.py @@ -13,155 +13,250 @@ class LRSchedulerFactory(object): - - def create(optimizer, lrsch_type, - decay_rate=1/100, decay_steps=100, - power=0.5, - hold_steps=10, - t=10, t_mul=1, - warm_restarts=False, gamma=1, - monitor='val_loss', mode='min', - factor=0.1, patience=10, - threshold=1e-4, threshold_mode='rel', - cooldown=0, eps=1e-8, - min_lr=0, warmup_steps=0, update_lr_on_opt_step=False): - - if lrsch_type == 'none': + def create( + optimizer, + lrsch_type, + decay_rate=1 / 100, + decay_steps=100, + power=0.5, + hold_steps=10, + t=10, + t_mul=1, + warm_restarts=False, + gamma=1, + monitor="val_loss", + mode="min", + factor=0.1, + patience=10, + threshold=1e-4, + threshold_mode="rel", + cooldown=0, + eps=1e-8, + min_lr=0, + warmup_steps=0, + update_lr_on_opt_step=False, + ): + + if lrsch_type == "none": return None - - if lrsch_type == 'exp_lr': - return ExponentialLR( - optimizer, decay_rate, decay_steps, hold_steps, - min_lr=min_lr, warmup_steps=warmup_steps, - update_lr_on_opt_step=update_lr_on_opt_step) - if lrsch_type == 'invpow_lr': + if lrsch_type == "exp_lr": + return ExponentialLR( + optimizer, + decay_rate, + decay_steps, + hold_steps, + min_lr=min_lr, + warmup_steps=warmup_steps, + update_lr_on_opt_step=update_lr_on_opt_step, + ) + + if lrsch_type == "invpow_lr": return InvPowLR( - optimizer, power, hold_steps, - min_lr=min_lr, warmup_steps=warmup_steps, - update_lr_on_opt_step=update_lr_on_opt_step) - - - if lrsch_type == 'cos_lr': - return CosineLR(optimizer, t, t_mul, min_lr=min_lr, - warmup_steps=warmup_steps, - warm_restarts=warm_restarts, gamma=gamma, - update_lr_on_opt_step=update_lr_on_opt_step) - - if lrsch_type == 'adamcos_lr': - return AdamCosineLR(optimizer, t, t_mul, warmup_steps=warmup_steps, - warm_restarts=warm_restarts, gamma=gamma, - update_lr_on_opt_step=update_lr_on_opt_step) - - if lrsch_type == 'red_lr_on_plateau': + optimizer, + power, + hold_steps, + min_lr=min_lr, + warmup_steps=warmup_steps, + update_lr_on_opt_step=update_lr_on_opt_step, + ) + + if lrsch_type == "cos_lr": + return CosineLR( + optimizer, + t, + t_mul, + min_lr=min_lr, + warmup_steps=warmup_steps, + warm_restarts=warm_restarts, + gamma=gamma, + update_lr_on_opt_step=update_lr_on_opt_step, + ) + + if lrsch_type == "adamcos_lr": + return AdamCosineLR( + optimizer, + t, + t_mul, + warmup_steps=warmup_steps, + warm_restarts=warm_restarts, + gamma=gamma, + update_lr_on_opt_step=update_lr_on_opt_step, + ) + + if lrsch_type == "red_lr_on_plateau": return ReduceLROnPlateau( - optimizer, monitor, mode, - factor=factor, patience=patience, - threshold=threshold, threshold_mode=threshold_mode, - cooldown=cooldown, min_lr=min_lr, warmup_steps=warmup_steps, eps=eps) - + optimizer, + monitor, + mode, + factor=factor, + patience=patience, + threshold=threshold, + threshold_mode=threshold_mode, + cooldown=cooldown, + min_lr=min_lr, + warmup_steps=warmup_steps, + eps=eps, + ) @staticmethod def filter_args(**kwargs): - valid_args = ('lrsch_type', 'decay_rate', 'decay_steps', 'hold_steps', 'power', - 
't', 't_mul', 'warm_restarts', 'gamma', 'monitor', - 'mode','factor','patience','threshold', - 'threshold_mode','cooldown','eps','min_lr', 'warmup_steps', 'update_lr_on_opt_step') - - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - + valid_args = ( + "lrsch_type", + "decay_rate", + "decay_steps", + "hold_steps", + "power", + "t", + "t_mul", + "warm_restarts", + "gamma", + "monitor", + "mode", + "factor", + "patience", + "threshold", + "threshold_mode", + "cooldown", + "eps", + "min_lr", + "warmup_steps", + "update_lr_on_opt_step", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') - - parser.add_argument('--lrsch-type', type=str.lower, - default='none', - choices=['none','exp_lr', 'invpow_lr', 'cos_lr', 'adamcos_lr', 'red_lr_on_plateau'], - help=('Learning rate schedulers: None, Exponential,' - 'Cosine Annealing, Cosine Annealing for Adam,' - 'Reduce on Plateau')) - - parser.add_argument('--decay-rate' , - default=1/100, type=float, - help=('LR decay rate in exp lr')) - parser.add_argument('--decay-steps' , - default=100, type=int, - help=('LR decay steps in exp lr')) - parser.add_argument('--power' , - default=0.5, type=float, - help=('power in inverse power lr')) - - parser.add_argument('--hold-steps' , - default=10, type=int, - help=('LR hold steps in exp lr')) - parser.add_argument('--t' , - default=10, type=int, - help=('Period in cos lr')) - parser.add_argument('--t-mul' , - default=1, type=int, - help=('Period multiplicator for each restart in cos lr')) - parser.add_argument('--gamma' , - default=1/100, type=float, - help=('LR decay rate for each restart in cos lr')) - - parser.add_argument('--warm-restarts', default=False, - action='store_true', - help=('Do warm restarts in cos lr')) - - parser.add_argument('--monitor', default='val_loss', - help=('Monitor metric to reduce lr')) - parser.add_argument('--mode', default='min', - choices =['min','max'], - help=('Monitor metric mode to reduce lr')) - - parser.add_argument('--factor' , - default=0.1, type=float, - help=('Factor by which the learning rate will be reduced on plateau')) - - parser.add_argument('--patience' , - default=10, type=int, - help=('Number of epochs with no improvement after which learning rate will be reduced')) - - parser.add_argument('--threshold' , - default=1e-4, type=float, - help=('Minimum metric improvement')) - - parser.add_argument('--threshold_mode', default='rel', - choices =['rel','abs'], - help=('Relative or absolute')) - - parser.add_argument('--cooldown' , - default=0, type=int, - help=('Number of epochs to wait before resuming normal operation after lr has been reduced')) - - parser.add_argument('--eps' , - default=1e-8, type=float, - help=('Minimum decay applied to lr')) - - parser.add_argument('--min-lr' , - default=0, type=float, - help=('Minimum lr')) - - parser.add_argument('--warmup-steps' , - default=0, type=int, - help=('Number of batches to warmup lr')) - - parser.add_argument('--update-lr-on-opt-step', default=False, - action='store_true', - help=('Update lr based on batch number instead of epoch number')) + parser = ArgumentParser(prog="") + + parser.add_argument( + "--lrsch-type", + type=str.lower, + default="none", + choices=[ + "none", + "exp_lr", + "invpow_lr", + "cos_lr", + "adamcos_lr", + "red_lr_on_plateau", + ], + help=( + "Learning rate schedulers: None, Exponential," + "Cosine Annealing, Cosine Annealing 
for Adam," + "Reduce on Plateau" + ), + ) + + parser.add_argument( + "--decay-rate", + default=1 / 100, + type=float, + help=("LR decay rate in exp lr"), + ) + parser.add_argument( + "--decay-steps", default=100, type=int, help=("LR decay steps in exp lr") + ) + parser.add_argument( + "--power", default=0.5, type=float, help=("power in inverse power lr") + ) + + parser.add_argument( + "--hold-steps", default=10, type=int, help=("LR hold steps in exp lr") + ) + parser.add_argument("--t", default=10, type=int, help=("Period in cos lr")) + parser.add_argument( + "--t-mul", + default=1, + type=int, + help=("Period multiplicator for each restart in cos lr"), + ) + parser.add_argument( + "--gamma", + default=1 / 100, + type=float, + help=("LR decay rate for each restart in cos lr"), + ) + + parser.add_argument( + "--warm-restarts", + default=False, + action="store_true", + help=("Do warm restarts in cos lr"), + ) + + parser.add_argument( + "--monitor", default="val_loss", help=("Monitor metric to reduce lr") + ) + parser.add_argument( + "--mode", + default="min", + choices=["min", "max"], + help=("Monitor metric mode to reduce lr"), + ) + + parser.add_argument( + "--factor", + default=0.1, + type=float, + help=("Factor by which the learning rate will be reduced on plateau"), + ) + + parser.add_argument( + "--patience", + default=10, + type=int, + help=( + "Number of epochs with no improvement after which learning rate will be reduced" + ), + ) + + parser.add_argument( + "--threshold", default=1e-4, type=float, help=("Minimum metric improvement") + ) + + parser.add_argument( + "--threshold_mode", + default="rel", + choices=["rel", "abs"], + help=("Relative or absolute"), + ) + + parser.add_argument( + "--cooldown", + default=0, + type=int, + help=( + "Number of epochs to wait before resuming normal operation after lr has been reduced" + ), + ) + + parser.add_argument( + "--eps", default=1e-8, type=float, help=("Minimum decay applied to lr") + ) + + parser.add_argument("--min-lr", default=0, type=float, help=("Minimum lr")) + + parser.add_argument( + "--warmup-steps", + default=0, + type=int, + help=("Number of batches to warmup lr"), + ) + + parser.add_argument( + "--update-lr-on-opt-step", + default=False, + action="store_true", + help=("Update lr based on batch number instead of epoch number"), + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='learning rate scheduler options') - - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='learning rate scheduler options') add_argparse_args = add_class_args diff --git a/hyperion/torch/lr_schedulers/invpow_lr.py b/hyperion/torch/lr_schedulers/invpow_lr.py index 14d02850..53aa28dc 100644 --- a/hyperion/torch/lr_schedulers/invpow_lr.py +++ b/hyperion/torch/lr_schedulers/invpow_lr.py @@ -4,34 +4,40 @@ """ - import torch from .lr_scheduler import LRScheduler + class InvPowLR(LRScheduler): - """inverse power learning rate scheduler. 
- """ - def __init__(self, optimizer, power=0.5, hold_steps=0, - min_lr=0, warmup_steps=0, - epoch=0, step=0, update_lr_on_opt_step=False): + """inverse power learning rate scheduler.""" + + def __init__( + self, + optimizer, + power=0.5, + hold_steps=0, + min_lr=0, + warmup_steps=0, + epoch=0, + step=0, + update_lr_on_opt_step=False, + ): super(InvPowLR, self).__init__( - optimizer, min_lr, warmup_steps, - epoch, step, update_lr_on_opt_step) + optimizer, min_lr, warmup_steps, epoch, step, update_lr_on_opt_step + ) self.power = power self.hold_steps = max(hold_steps, self.warmup_steps) - def get_lr(self, step): if step < self.hold_steps: return self.base_lrs - x = step/self.hold_steps - return [max( - min_lr, - base_lr * x ** (-self.power)) - for base_lr, min_lr in zip(self.base_lrs, self.min_lrs)] - + x = step / self.hold_steps + return [ + max(min_lr, base_lr * x ** (-self.power)) + for base_lr, min_lr in zip(self.base_lrs, self.min_lrs) + ] def load_state_dict(self, state_dict): """Loads the schedulers state. @@ -41,6 +47,6 @@ def load_state_dict(self, state_dict): from a call to :meth:`state_dict`. """ # we only load step and epoch so we can change the scheduler params during training - self.step = state_dict['step'] - self.epoch = state_dict['epoch'] - #self.__dict__.update(state_dict) + self.step = state_dict["step"] + self.epoch = state_dict["epoch"] + # self.__dict__.update(state_dict) diff --git a/hyperion/torch/lr_schedulers/lr_scheduler.py b/hyperion/torch/lr_schedulers/lr_scheduler.py index 321f6c3c..319ea7a2 100644 --- a/hyperion/torch/lr_schedulers/lr_scheduler.py +++ b/hyperion/torch/lr_schedulers/lr_scheduler.py @@ -4,49 +4,59 @@ """ - - import torch import torch.optim as optim + class LRScheduler(object): - """Base class for learning rate schedulers - """ - def __init__(self, optimizer, min_lr=0, warmup_steps=0, - epoch=0, step=0, update_lr_on_opt_step=False): + """Base class for learning rate schedulers""" + + def __init__( + self, + optimizer, + min_lr=0, + warmup_steps=0, + epoch=0, + step=0, + update_lr_on_opt_step=False, + ): if not isinstance(optimizer, optim.Optimizer): - raise TypeError('%s is not an Optimizer' % - (type(optimizer).__name__)) + raise TypeError("%s is not an Optimizer" % (type(optimizer).__name__)) self.optimizer = optimizer if isinstance(min_lr, list) or isinstance(min_lr, tuple): if len(min_lr) != len(optimizer.param_groups): - raise ValueError("expected {} min_lrs, got {}".format( - len(optimizer.param_groups), len(min_lr))) + raise ValueError( + "expected {} min_lrs, got {}".format( + len(optimizer.param_groups), len(min_lr) + ) + ) self.min_lrs = list(min_lr) else: self.min_lrs = [min_lr] * len(optimizer.param_groups) if epoch == 0: for group in optimizer.param_groups: - group.setdefault('initial_lr', group['lr']) + group.setdefault("initial_lr", group["lr"]) else: for i, group in enumerate(optimizer.param_groups): - if 'initial_lr' not in group: - raise KeyError("param 'initial_lr' is not specified " - "in param_groups[{}] when resuming an optimizer".format(i)) - - self.base_lrs = list(map(lambda group: group['initial_lr'], optimizer.param_groups)) + if "initial_lr" not in group: + raise KeyError( + "param 'initial_lr' is not specified " + "in param_groups[{}] when resuming an optimizer".format(i) + ) + + self.base_lrs = list( + map(lambda group: group["initial_lr"], optimizer.param_groups) + ) self.warmup_steps = warmup_steps self.epoch = epoch self.step = step self.update_lr_on_opt_step = update_lr_on_opt_step - @property def 
in_warmup(self): return self.step <= self.warmup_steps - def state_dict(self): """Returns the state of the scheduler as a :class:`dict`. @@ -54,9 +64,10 @@ def state_dict(self): It contains an entry for every variable in self.__dict__ which is not the optimizer. """ - return {key: value for key, value in self.__dict__.items() if key != 'optimizer'} + return { + key: value for key, value in self.__dict__.items() if key != "optimizer" + } - def load_state_dict(self, state_dict): """Loads the schedulers state. @@ -66,17 +77,16 @@ def load_state_dict(self, state_dict): """ self.__dict__.update(state_dict) - def get_warmup_lr(self): x = self.step - return [(base_lr - min_lr)/self.warmup_steps*x + min_lr - for base_lr, min_lr in zip(self.base_lrs, self.min_lrs)] - + return [ + (base_lr - min_lr) / self.warmup_steps * x + min_lr + for base_lr, min_lr in zip(self.base_lrs, self.min_lrs) + ] def get_lr(self): raise NotImplementedError - def on_epoch_begin(self, epoch=None, **kwargs): if epoch is not None: self.epoch = epoch @@ -84,29 +94,31 @@ def on_epoch_begin(self, epoch=None, **kwargs): if self.update_lr_on_opt_step: return - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr(self.epoch)): - param_group['lr'] = lr + for param_group, lr in zip( + self.optimizer.param_groups, self.get_lr(self.epoch) + ): + param_group["lr"] = lr - def on_epoch_end(self, metrics=None): self.epoch += 1 - def on_opt_step(self): - #self.update_lr_on_opt_step=True - #print('exp-lr', self.last_step, self.hold_steps, self.decay_rate, self.decay_steps) + # self.update_lr_on_opt_step=True + # print('exp-lr', self.last_step, self.hold_steps, self.decay_rate, self.decay_steps) if self.in_warmup: - for param_group, lr in zip(self.optimizer.param_groups, self.get_warmup_lr()): - param_group['lr'] = lr + for param_group, lr in zip( + self.optimizer.param_groups, self.get_warmup_lr() + ): + param_group["lr"] = lr self.step += 1 return - + if self.update_lr_on_opt_step: - for param_group, lr in zip(self.optimizer.param_groups, self.get_lr(self.step)): - param_group['lr'] = lr + for param_group, lr in zip( + self.optimizer.param_groups, self.get_lr(self.step) + ): + param_group["lr"] = lr self.step += 1 - - diff --git a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py index e9f123ac..8d9eb4bf 100644 --- a/hyperion/torch/lr_schedulers/red_lr_on_plateau.py +++ b/hyperion/torch/lr_schedulers/red_lr_on_plateau.py @@ -14,51 +14,67 @@ class ReduceLROnPlateau(LRScheduler): """Reduce learning rate when a metric has stopped improving. - Models often benefit from reducing the learning rate by a factor - of 2-10 once learning stagnates. This scheduler reads a metrics - quantity and if no improvement is seen for a 'patience' number - of epochs, the learning rate is reduced. - - Attributes: - optimizer (Optimizer): optimizer. - mode (str): One of `min`, `max`. In `min` mode, lr will - be reduced when the quantity monitored has stopped - decreasing; in `max` mode it will be reduced when the - quantity monitored has stopped increasing. Default: 'min'. - factor (float): Factor by which the learning rate will be - reduced. new_lr = lr * factor. Default: 0.1. - patience (int): Number of epochs with no improvement after - which learning rate will be reduced. For example, if - `patience = 2`, then we will ignore the first 2 epochs - with no improvement, and will only decrease the LR after the - 3rd epoch if the loss still hasn't improved then. - Default: 10. 
- threshold (float): Threshold for measuring the new optimum, - to only focus on significant changes. Default: 1e-4. - threshold_mode (str): One of `rel`, `abs`. In `rel` mode, - dynamic_threshold = best * ( 1 + threshold ) in 'max' - mode or best * ( 1 - threshold ) in `min` mode. - In `abs` mode, dynamic_threshold = best + threshold in - `max` mode or best - threshold in `min` mode. Default: 'rel'. - cooldown (int): Number of epochs to wait before resuming - normal operation after lr has been reduced. Default: 0. - min_lr (float or list): A scalar or a list of scalars. A - lower bound on the learning rate of all param groups - or each group respectively. Default: 0. - eps (float): Minimal decay applied to lr. If the difference - between new and old lr is smaller than eps, the update is - ignored. Default: 1e-8. + Models often benefit from reducing the learning rate by a factor + of 2-10 once learning stagnates. This scheduler reads a metrics + quantity and if no improvement is seen for a 'patience' number + of epochs, the learning rate is reduced. + + Attributes: + optimizer (Optimizer): optimizer. + mode (str): One of `min`, `max`. In `min` mode, lr will + be reduced when the quantity monitored has stopped + decreasing; in `max` mode it will be reduced when the + quantity monitored has stopped increasing. Default: 'min'. + factor (float): Factor by which the learning rate will be + reduced. new_lr = lr * factor. Default: 0.1. + patience (int): Number of epochs with no improvement after + which learning rate will be reduced. For example, if + `patience = 2`, then we will ignore the first 2 epochs + with no improvement, and will only decrease the LR after the + 3rd epoch if the loss still hasn't improved then. + Default: 10. + threshold (float): Threshold for measuring the new optimum, + to only focus on significant changes. Default: 1e-4. + threshold_mode (str): One of `rel`, `abs`. In `rel` mode, + dynamic_threshold = best * ( 1 + threshold ) in 'max' + mode or best * ( 1 - threshold ) in `min` mode. + In `abs` mode, dynamic_threshold = best + threshold in + `max` mode or best - threshold in `min` mode. Default: 'rel'. + cooldown (int): Number of epochs to wait before resuming + normal operation after lr has been reduced. Default: 0. + min_lr (float or list): A scalar or a list of scalars. A + lower bound on the learning rate of all param groups + or each group respectively. Default: 0. + eps (float): Minimal decay applied to lr. If the difference + between new and old lr is smaller than eps, the update is + ignored. Default: 1e-8. 
""" - def __init__(self, optimizer, monitor='val_loss', mode='min', - factor=0.1, patience=10, - threshold=1e-4, threshold_mode='rel', - cooldown=0, min_lr=0, warmup_steps=0, eps=1e-8): + + def __init__( + self, + optimizer, + monitor="val_loss", + mode="min", + factor=0.1, + patience=10, + threshold=1e-4, + threshold_mode="rel", + cooldown=0, + min_lr=0, + warmup_steps=0, + eps=1e-8, + ): super(ReduceLROnPlateau, self).__init__( - optimizer, min_lr, warmup_steps, - epoch=0, step=0, update_lr_on_opt_step=False) + optimizer, + min_lr, + warmup_steps, + epoch=0, + step=0, + update_lr_on_opt_step=False, + ) if factor >= 1.0: - raise ValueError('Factor should be < 1.0.') + raise ValueError("Factor should be < 1.0.") self.factor = factor self.monitor = monitor @@ -73,31 +89,30 @@ def __init__(self, optimizer, monitor='val_loss', mode='min', self.mode_worse = None # the worse value for the chosen mode self.is_better = None self.eps = eps - self._init_is_better(mode=mode, threshold=threshold, - threshold_mode=threshold_mode) + self._init_is_better( + mode=mode, threshold=threshold, threshold_mode=threshold_mode + ) self._reset() - def _reset(self): """Resets num_bad_epochs counter and cooldown counter.""" self.best = self.mode_worse self.cooldown_counter = 0 self.num_bad_epochs = 0 - def on_opt_step(self): self.step = self.step + 1 if self.in_warmup: - for param_group, lr in zip(self.optimizer.param_groups, self.get_warmup_lr()): - param_group['lr'] = lr + for param_group, lr in zip( + self.optimizer.param_groups, self.get_warmup_lr() + ): + param_group["lr"] = lr return - def on_epoch_begin(self, epoch=None): if epoch is not None: self.epoch = epoch - - + def on_epoch_end(self, metrics=None): current = metrics[self.monitor] if self.is_better(current, self.best): @@ -117,51 +132,51 @@ def on_epoch_end(self, metrics=None): self.epoch += 1 - def _reduce_lr(self, epoch): for i, param_group in enumerate(self.optimizer.param_groups): - old_lr = float(param_group['lr']) + old_lr = float(param_group["lr"]) new_lr = max(old_lr * self.factor, self.min_lrs[i]) if old_lr - new_lr > self.eps: - param_group['lr'] = new_lr - logging.info('Epoch {:5d}: reducing learning rate' - ' of group {} to {:.4e}.'.format(epoch, i, new_lr)) + param_group["lr"] = new_lr + logging.info( + "Epoch {:5d}: reducing learning rate" + " of group {} to {:.4e}.".format(epoch, i, new_lr) + ) @property def in_cooldown(self): return self.cooldown_counter > 0 def _cmp(self, mode, threshold_mode, threshold, a, best): - if mode == 'min' and threshold_mode == 'rel': - rel_epsilon = 1. - threshold + if mode == "min" and threshold_mode == "rel": + rel_epsilon = 1.0 - threshold return a < best * rel_epsilon - elif mode == 'min' and threshold_mode == 'abs': + elif mode == "min" and threshold_mode == "abs": return a < best - threshold - elif mode == 'max' and threshold_mode == 'rel': - rel_epsilon = threshold + 1. 
+ elif mode == "max" and threshold_mode == "rel": + rel_epsilon = threshold + 1.0 return a > best * rel_epsilon else: # mode == 'max' and epsilon_mode == 'abs': return a > best + threshold def _init_is_better(self, mode, threshold, threshold_mode): - if mode not in {'min', 'max'}: - raise ValueError('mode ' + mode + ' is unknown!') - if threshold_mode not in {'rel', 'abs'}: - raise ValueError('threshold mode ' + threshold_mode + ' is unknown!') + if mode not in {"min", "max"}: + raise ValueError("mode " + mode + " is unknown!") + if threshold_mode not in {"rel", "abs"}: + raise ValueError("threshold mode " + threshold_mode + " is unknown!") - if mode == 'min': + if mode == "min": self.mode_worse = inf else: # mode == 'max': self.mode_worse = -inf self.is_better = partial(self._cmp, mode, threshold_mode, threshold) - - def load_state_dict(self, state_dict): self.__dict__.update(state_dict) - self._init_is_better(mode=self.mode, threshold=self.threshold, - threshold_mode=self.threshold_mode) + self._init_is_better( + mode=self.mode, threshold=self.threshold, threshold_mode=self.threshold_mode + ) diff --git a/hyperion/torch/metrics/__init__.py b/hyperion/torch/metrics/__init__.py index d09d353c..b4a2eaac 100644 --- a/hyperion/torch/metrics/__init__.py +++ b/hyperion/torch/metrics/__init__.py @@ -4,8 +4,6 @@ """ - from .metrics import TorchMetric from .accuracy_functional import * from .accuracy import * - diff --git a/hyperion/torch/metrics/accuracy.py b/hyperion/torch/metrics/accuracy.py index 5025197e..ebd02e32 100644 --- a/hyperion/torch/metrics/accuracy.py +++ b/hyperion/torch/metrics/accuracy.py @@ -8,45 +8,36 @@ from .metrics import TorchMetric from .accuracy_functional import * -class CategoricalAccuracy(TorchMetric): - - def __init__(self, weight=None, reduction='mean'): - super(CategoricalAccuracy, self).__init__( - weight=weight, reduction=reduction) +class CategoricalAccuracy(TorchMetric): + def __init__(self, weight=None, reduction="mean"): + super(CategoricalAccuracy, self).__init__(weight=weight, reduction=reduction) def forward(self, input, target): - return categorical_accuracy(input, target, weight=self.weight, - reduction=self.reduction) + return categorical_accuracy( + input, target, weight=self.weight, reduction=self.reduction + ) class BinaryAccuracy(TorchMetric): - - def __init__(self, weight=None, reduction='mean', thr=0.5): - super(BinaryAccuracy, self).__init__( - weight=weight, reduction=reduction) + def __init__(self, weight=None, reduction="mean", thr=0.5): + super(BinaryAccuracy, self).__init__(weight=weight, reduction=reduction) self.thr = thr - def forward(self, input, target): - return binary_accuracy(input, target, weight=self.weight, - reduction=self.reduction, thr=self.thr) - + return binary_accuracy( + input, target, weight=self.weight, reduction=self.reduction, thr=self.thr + ) class BinaryAccuracyWithLogits(TorchMetric): - - def __init__(self, weight=None, reduction='mean', thr=0.0): + def __init__(self, weight=None, reduction="mean", thr=0.0): super(BinaryAccuracyWithLogits, self).__init__( - weight=weight, reduction=reduction) + weight=weight, reduction=reduction + ) self.thr = thr - def forward(self, input, target): - return binary_accuracy_with_logits(input, target, weight=self.weight, - reduction=self.reduction, thr=self.thr) - - - - - + return binary_accuracy_with_logits( + input, target, weight=self.weight, reduction=self.reduction, thr=self.thr + ) diff --git a/hyperion/torch/metrics/accuracy_functional.py 
b/hyperion/torch/metrics/accuracy_functional.py index d1fdf4da..aeb384c9 100644 --- a/hyperion/torch/metrics/accuracy_functional.py +++ b/hyperion/torch/metrics/accuracy_functional.py @@ -4,73 +4,83 @@ """ - import torch -def categorical_accuracy(input, target, weight=None, reduction='mean'): +def categorical_accuracy(input, target, weight=None, reduction="mean"): dim = input.dim() if dim < 2: - raise ValueError('Expected 2 or more dimensions (got %d)' % (dim)) + raise ValueError("Expected 2 or more dimensions (got %d)" % (dim)) if input.size(0) != target.size(0): - raise ValueError('Expected input batch_size (%d) to match target batch_size (%d).' - % (input.size(0), target.size(0))) + raise ValueError( + "Expected input batch_size (%d) to match target batch_size (%d)." + % (input.size(0), target.size(0)) + ) with torch.no_grad(): _, pred = torch.max(input, dim=-1) if target.dim() == 2: _, target = torch.max(target, dim=-1) - + ok = pred.eq(target).float() - if reduction == 'none': + if reduction == "none": return ok weight_mean = 1 if weight is not None: if input.size(0) != weight.size(0): - raise ValueError('Expected input batch_size (%d) to match weight batch_size (%d).' - % (input.size(0), weight.size(0))) + raise ValueError( + "Expected input batch_size (%d) to match weight batch_size (%d)." + % (input.size(0), weight.size(0)) + ) ok *= weight weight_mean = weight.mean() - if reduction == 'sum': + if reduction == "sum": return ok.sum().item() - acc = ok.mean()/weight_mean + acc = ok.mean() / weight_mean return acc.item() - -def binary_accuracy(input, target, weight=None, reduction='mean', thr=0.5): +def binary_accuracy(input, target, weight=None, reduction="mean", thr=0.5): dim = input.dim() if dim < 2: - raise ValueError('Expected 2 or more dimensions (got %d)' % (dim)) + raise ValueError("Expected 2 or more dimensions (got %d)" % (dim)) if not (target.size() == input.size()): - raise ValueError("Target size ({}) is different to the input size ({}).".format(target.size(), input.size())) + raise ValueError( + "Target size ({}) is different to the input size ({}).".format( + target.size(), input.size() + ) + ) if input.numel() != target.numel(): - raise ValueError("Target and input must have the same number of elements. target nelement ({}) " - "!= input nelement ({})".format(target.numel(), input.numel())) - + raise ValueError( + "Target and input must have the same number of elements. target nelement ({}) " + "!= input nelement ({})".format(target.numel(), input.numel()) + ) + with torch.no_grad(): pred = input > thr ok = pred.eq(target).float() - if reduction == 'none': + if reduction == "none": return ok weight_mean = 1 if weight is not None: if input.size(0) != weight.size(0): - raise ValueError('Expected input batch_size (%d) to match weight batch_size (%d).' - % (input.size(0), weight.size(0))) + raise ValueError( + "Expected input batch_size (%d) to match weight batch_size (%d)." 
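A short usage sketch of the `categorical_accuracy` function reformatted above, assuming the module is importable under the path shown in this diff (the tensors are made-up examples):

    import torch
    from hyperion.torch.metrics.accuracy_functional import categorical_accuracy

    logits = torch.tensor([[2.0, 0.1], [0.2, 1.5], [3.0, 0.5]])  # (batch=3, num_classes=2)
    target = torch.tensor([0, 1, 1])
    # argmax predictions are [0, 1, 0], so two of the three samples are correct
    acc = categorical_accuracy(logits, target)                           # 0.666...
    per_sample = categorical_accuracy(logits, target, reduction="none")  # tensor([1., 1., 0.])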
+ % (input.size(0), weight.size(0)) + ) if weight.dim() == 1: ok *= weight.unsqueeze(1) @@ -79,23 +89,13 @@ def binary_accuracy(input, target, weight=None, reduction='mean', thr=0.5): weight_mean = weight.mean() - - if reduction == 'sum': + if reduction == "sum": return ok.sum().item() - acc = ok.mean()/weight_mean + acc = ok.mean() / weight_mean return ok.item() - -def binary_accuracy_with_logits(input, target, weight=None, reduction='mean', thr=0): +def binary_accuracy_with_logits(input, target, weight=None, reduction="mean", thr=0): return binary_accuracy(input, target, weight, reduction, thr) - - - - - - - - diff --git a/hyperion/torch/metrics/metrics.py b/hyperion/torch/metrics/metrics.py index d5431a10..62b9769c 100644 --- a/hyperion/torch/metrics/metrics.py +++ b/hyperion/torch/metrics/metrics.py @@ -8,12 +8,11 @@ class TorchMetric(nn.Module): - """Base class for metrics that cannot be - objective functions + """Base class for metrics that cannot be + objective functions """ - def __init__(self, weight=None, reduction='mean'): + + def __init__(self, weight=None, reduction="mean"): super().__init__() self.weight = weight self.reduction = reduction - - diff --git a/hyperion/torch/models/tvector/__init__.py b/hyperion/torch/models/tvector/__init__.py index b9a7e45f..98db2561 100644 --- a/hyperion/torch/models/tvector/__init__.py +++ b/hyperion/torch/models/tvector/__init__.py @@ -6,5 +6,3 @@ # t-vectors from .tvector import TVector from .resnet_tvector import ResNetTVector - - diff --git a/hyperion/torch/models/tvector/resnet_tvector.py b/hyperion/torch/models/tvector/resnet_tvector.py index 8d4e200a..d74272aa 100644 --- a/hyperion/torch/models/tvector/resnet_tvector.py +++ b/hyperion/torch/models/tvector/resnet_tvector.py @@ -14,61 +14,91 @@ class ResNetXVector(XVector): - - def __init__(self, - in_feats, num_classes, - resnet_cfg=Namespace( - resnet_type='resnet34', - in_channels=1, - conv_channels=64, base_channels=64, - in_kernel_size=7, in_stride=1, - zero_init_residual=False, - groups=1, replace_stride_with_dilation=None, - do_maxpool=False, - hid_act={'name':'relu', 'inplace':True}, - dropout_rate=0, - norm_layer=None, - use_norm=True, - norm_before=True, - in_norm=False, - se_r=16, res2net_scale=4, res2net_width_factor=1), - conformer_cfg=Namespace( - d_model=256, num_heads=4, num_blocks=6, - attype='scaled-dot-prod-v1', atcontext=25, - conv_repeats=1, conv_kernel_sizes=31, conv_strides=1, - ff_type='linear', d_ff=2048, ff_kernel_size=1, - dropourate=0.1, pos_dropourate=0.1, att_dropout_rate=0.0, - in_layer_type='conv2d-sub', - rel_pos_enc=True, causal_pos_enc=False, no_pos_enc=False, - hid_act='swish', - conv_norm_layer=None, se_r=None, - ff_macaron=True, red_lnorms=False, concat_after=False), - pool_net='mean+stddev', - head_cfg=Namespace( - embed_dim=256, - num_embed_layers=1, - head_hid_act={'name':'relu', 'inplace': True}, - loss_type='arc-softmax', - s=64, margin=0.3, margin_warmup_epochs=0, - num_subcenters=2, - norm_layer=None, - use_norm=True, norm_before=True, - dropout_rate=0, - embed_layer=0), + def __init__( + self, + in_feats, + num_classes, + resnet_cfg=Namespace( + resnet_type="resnet34", + in_channels=1, + conv_channels=64, + base_channels=64, + in_kernel_size=7, + in_stride=1, + zero_init_residual=False, + groups=1, + replace_stride_with_dilation=None, + do_maxpool=False, + hid_act={"name": "relu", "inplace": True}, + dropout_rate=0, + norm_layer=None, + use_norm=True, + norm_before=True, + in_norm=False, + se_r=16, + res2net_scale=4, + res2net_width_factor=1, 
+ ), + conformer_cfg=Namespace( + d_model=256, + num_heads=4, + num_blocks=6, + attype="scaled-dot-prod-v1", + atcontext=25, + conv_repeats=1, + conv_kernel_sizes=31, + conv_strides=1, + ff_type="linear", + d_ff=2048, + ff_kernel_size=1, + dropourate=0.1, + pos_dropourate=0.1, + att_dropout_rate=0.0, + in_layer_type="conv2d-sub", + rel_pos_enc=True, + causal_pos_enc=False, + no_pos_enc=False, + hid_act="swish", + conv_norm_layer=None, + se_r=None, + ff_macaron=True, + red_lnorms=False, + concat_after=False, + ), + pool_net="mean+stddev", + head_cfg=Namespace( + embed_dim=256, + num_embed_layers=1, + head_hid_act={"name": "relu", "inplace": True}, + loss_type="arc-softmax", + s=64, + margin=0.3, + margin_warmup_epochs=0, + num_subcenters=2, + norm_layer=None, + use_norm=True, + norm_before=True, + dropout_rate=0, + embed_layer=0, + ), ): - - logging.info('making %s encoder network' % (resnet_type)) + + logging.info("making %s encoder network" % (resnet_type)) if isinstance(resnet_cfg, Namespace): resnet_cfg = var(resnet_cfg) - self.resnet_type = resnet_cfg['resnet_type'] + self.resnet_type = resnet_cfg["resnet_type"] encoder_net = RNF.create(**resnet_cfg) - + super().__init__( - encoder_net, num_classes, conformer_cfg=conformer_cfg, pool_net=pool_net, + encoder_net, + num_classes, + conformer_cfg=conformer_cfg, + pool_net=pool_net, head_cfg=head_cfg, - in_feats=in_feats, proj_feats=None) - + in_feats=in_feats, + proj_feats=None, + ) @property def in_channels(self): @@ -125,31 +155,28 @@ def res2net_width_factor(self): def get_config(self): base_config = super().get_config() - del base_config['encoder_cfg'] + del base_config["encoder_cfg"] enc_cfg = self.encoder_net.get_config() - del enc_cfg['block'] - del enc_cfg['out_units'] - del enc_cfg['out_act'] - enc_cfg['resnet_type'] = self.resnet_type - - base_config['resnet_cfg'] = enc_cfg + del enc_cfg["block"] + del enc_cfg["out_units"] + del enc_cfg["out_act"] + enc_cfg["resnet_type"] = self.resnet_type - return base_config + base_config["resnet_cfg"] = enc_cfg + return base_config @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - - cfg, state_dict = cls._load_cfg_state_dict( - file_path, cfg, state_dict) - model = cls(**cfg) + cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) + + model = cls(**cfg) if state_dict is not None: model.load_state_dict(state_dict) return model - def filter_args(prefix=None, **kwargs): base_args = XVector.filter_args(prefix, **kwargs) @@ -158,15 +185,12 @@ def filter_args(prefix=None, **kwargs): base_args.update(child_args) return base_args - @staticmethod def add_argparse_args(parser, prefix=None): - + XVector.add_argparse_args(parser, prefix) if prefix is None: - prefix = 'resnet' + prefix = "resnet" else: - prefix = prefix '-resnet' + prefix = prefix + "-resnet" RNF.add_argparse_args(parser, prefix) - - diff --git a/hyperion/torch/models/tvector/tvector.py b/hyperion/torch/models/tvector/tvector.py index 890eae4e..8a3758fb 100644 --- a/hyperion/torch/models/tvector/tvector.py +++ b/hyperion/torch/models/tvector/tvector.py @@ -12,21 +12,22 @@ from ..layers import GlobalPool1dFactory as PF from ..layer_blocks import TDNNBlock from ...narchs import ClassifHead, ConformerEncoderV1, TorchNALoader -from ..narchs import ClassifHead, from ..torch_model import TorchModel from ..utils import eval_nnet_by_chunks class TXVector(TorchModel): - """x-Vector base class - """ - def __init__(self, - encoder_net, - num_classes, - conformer_net={}, - pool_net='mean+stddev', - 
classif_net={}, - in_feats=None): + """x-Vector base class""" + + def __init__( + self, + encoder_net, + num_classes, + conformer_net={}, + pool_net="mean+stddev", + classif_net={}, + in_feats=None, + ): super().__init__() @@ -42,7 +43,9 @@ def __init__(self, enc_feats = out_shape[1] elif len(in_shape) == 4: # encoder based in 2d convs - assert in_feats is not None, 'in_feats dimension must be given to calculate pooling dimension' + assert ( + in_feats is not None + ), "in_feats dimension must be given to calculate pooling dimension" in_shape = list(in_shape) in_shape[2] = in_feats out_shape = self.encoder_net.out_shape(tuple(in_shape)) @@ -50,30 +53,29 @@ def __init__(self, self.in_feats = in_feats - logging.info('encoder input shape={}'.format(in_shape)) - logging.info('encoder output shape={}'.format(out_shape)) + logging.info("encoder input shape={}".format(in_shape)) + logging.info("encoder output shape={}".format(out_shape)) # create conformer net if isinstance(conformer_net, nn.Module): self.conformer_net = conformer_net else: - logging.info('making conformer net') - conformer_net['in_layer_type'] = 'linear' - self.conformer_net = ConformerEncoderV1(enc_feats, - in_time_dim=1, - out_time_dim=1, - **conformer_net) + logging.info("making conformer net") + conformer_net["in_layer_type"] = "linear" + self.conformer_net = ConformerEncoderV1( + enc_feats, in_time_dim=1, out_time_dim=1, **conformer_net + ) d_model = self.conformer_net.d_model self.pool_net = self._make_pool_net(pool_cfg, d_model) pool_feats = int(d_model * self.pool_net.size_multiplier) - logging.info('infer pooling dimension %d', pool_feats) + logging.info("infer pooling dimension %d", pool_feats) # create classification head if isinstance(classif_net, nn.Module): self.classif_net = classif_net else: - logging.info('making classification head net') + logging.info("making classification head net") self.classif_net = ClassifHead(pool_feats, num_classes, **head_cfg) @property @@ -113,8 +115,8 @@ def loss_type(self): return self.classif_net.loss_type def _make_pool_net(self, pool_net, enc_feats=None): - """ Makes the pooling block - + """Makes the pooling block + Args: pool_net: str or dict to pass to the pooling factory create function enc_feats: dimension of the features coming from the encoder @@ -123,17 +125,17 @@ def _make_pool_net(self, pool_net, enc_feats=None): GlobalPool1d object """ if isinstance(pool_net, str): - pool_net = {'pool_type': pool_net} + pool_net = {"pool_type": pool_net} if isinstance(pool_net, dict): if enc_feats is not None: - pool_net['in_feats'] = enc_feats + pool_net["in_feats"] = enc_feats return PF.create(**pool_net) elif isinstance(pool_net, nn.Module): return pool_net else: - raise Exception('Invalid pool_net argument') + raise Exception("Invalid pool_net argument") def update_loss_margin(self, epoch): """Updates the value of the margin in AAM/AM-softmax losses @@ -158,32 +160,33 @@ def _post_enc(self, x): return x - def forward(self, - x, - y=None, - enc_layers=None, - classif_layers=None, - return_output=True, - use_amp=False): + def forward( + self, + x, + y=None, + enc_layers=None, + classif_layers=None, + return_output=True, + use_amp=False, + ): if enc_layers is None and classif_layers is None: return self.forward_output(x, y) - h = self.forward_hid_feats(x, y, enc_layers, classif_layers, - return_output) + h = self.forward_hid_feats(x, y, enc_layers, classif_layers, return_output) output = {} if enc_layers is not None: if classif_layers is None: - output['h_enc'] = h + output["h_enc"] = 
h else: - output['h_enc'] = h[0] + output["h_enc"] = h[0] else: - output['h_enc'] = [] + output["h_enc"] = [] if classif_layers is not None: - output['h_classif'] = h[1] + output["h_classif"] = h[1] else: - output['h_classif'] = [] + output["h_classif"] = [] if return_output: - output['output'] = h[2] + output["output"] = h[2] return output def forward_output(self, x, y=None): @@ -192,7 +195,7 @@ def forward_output(self, x, y=None): Args: x: input features tensor with shape=(batch, in_feats, time) y: target classes torch.long tensor with shape=(batch,) - + Returns: class posteriors tensor with shape=(batch, num_classes) """ @@ -209,26 +212,25 @@ class posteriors tensor with shape=(batch, num_classes) y = self.classif_net(p, y) return y - def forward_hid_feats(self, - x, - y=None, - enc_layers=None, - conf_layers=None, - classif_layers=None, - return_output=False): - """forwards hidden representations in the x-vector network - """ + def forward_hid_feats( + self, + x, + y=None, + enc_layers=None, + conf_layers=None, + classif_layers=None, + return_output=False, + ): + """forwards hidden representations in the x-vector network""" if self.encoder_net.in_dim() == 4 and x.dim() == 3: x = x.view(x.size(0), 1, x.size(1), x.size(2)) - h_enc, x = self.encoder_net.forward_hid_feats(x, - enc_layers, - return_output=True) + h_enc, x = self.encoder_net.forward_hid_feats(x, enc_layers, return_output=True) - h_conf, x = self.conformer_net.forward_hid_feats(x, - conf_layers, - return_output=True) + h_conf, x = self.conformer_net.forward_hid_feats( + x, conf_layers, return_output=True + ) if not return_output and classif_layers is None: return h_enc @@ -241,28 +243,24 @@ def forward_hid_feats(self, p = self.pool_net(x) h_classif = self.classif_net.forward_hid_feats( - p, y, classif_layers, return_output=return_output) + p, y, classif_layers, return_output=return_output + ) if return_output: h_classif, y = h_classif return h_enc, h_classif, y return h_enc, h_classif - def extract_embed(self, - x, - chunk_length=0, - embed_layer=None, - detach_chunks=False): + def extract_embed(self, x, chunk_length=0, embed_layer=None, detach_chunks=False): if embed_layer is None: embed_layer = self.embed_layer x = self._pre_enc(x) # if self.encoder_net.in_dim() == 4 and x.dim() == 3: # x = x.view(x.size(0), 1, x.size(1), x.size(2)) - x = eval_nnet_by_chunks(x, - self.encoder_net, - chunk_length, - detach_chunks=detach_chunks) + x = eval_nnet_by_chunks( + x, self.encoder_net, chunk_length, detach_chunks=detach_chunks + ) if x.device != self.device: x = x.to(self.device) @@ -279,38 +277,40 @@ def extract_embed(self, y = self.classif_net.extract_embed(p, embed_layer) return y - def extract_embed_slidwin(self, - x, - win_length, - win_shift, - snip_edges=False, - feat_frame_length=None, - feat_frame_shift=None, - chunk_length=0, - embed_layer=None, - detach_chunks=False): + def extract_embed_slidwin( + self, + x, + win_length, + win_shift, + snip_edges=False, + feat_frame_length=None, + feat_frame_shift=None, + chunk_length=0, + embed_layer=None, + detach_chunks=False, + ): if feat_frame_shift is not None: - #assume win_length/shift are in secs, transform to frames + # assume win_length/shift are in secs, transform to frames # pass feat times from msecs to secs feat_frame_shift = feat_frame_shift / 1000 feat_frame_length = feat_frame_length / 1000 # get length and shift in number of feature frames win_shift = win_shift / feat_frame_shift # this can be a float - win_length = (win_length - feat_frame_length + - feat_frame_shift) 
/ feat_frame_shift - assert win_shift > 0.5, 'win-length should be longer than feat-frame-length' + win_length = ( + win_length - feat_frame_length + feat_frame_shift + ) / feat_frame_shift + assert win_shift > 0.5, "win-length should be longer than feat-frame-length" if embed_layer is None: embed_layer = self.embed_layer in_time = x.size(-1) x = self._pre_enc(x) - x = eval_nnet_by_chunks(x, - self.encoder_net, - chunk_length, - detach_chunks=detach_chunks) + x = eval_nnet_by_chunks( + x, self.encoder_net, chunk_length, detach_chunks=detach_chunks + ) if x.device != self.device: x = x.to(self.device) @@ -318,44 +318,65 @@ def extract_embed_slidwin(self, x = self._post_enc(x) pin_time = x.size(-1) # time dim before pooling downsample_factor = float(pin_time) / in_time - p = self.pool_net.forward_slidwin(x, - downsample_factor * win_length, - downsample_factor * win_shift, - snip_edges=snip_edges) + p = self.pool_net.forward_slidwin( + x, + downsample_factor * win_length, + downsample_factor * win_shift, + snip_edges=snip_edges, + ) # (batch, pool_dim, time) p = p.transpose(1, 2).contiguous().view(-1, p.size(1)) - y = self.classif_net.extract_embed(p, embed_layer).view( - x.size(0), -1, self.embed_dim).transpose(1, 2).contiguous() + y = ( + self.classif_net.extract_embed(p, embed_layer) + .view(x.size(0), -1, self.embed_dim) + .transpose(1, 2) + .contiguous() + ) return y - def compute_slidwin_timestamps(self, - num_windows, - win_length, - win_shift, - snip_edges=False, - feat_frame_length=25, - feat_frame_shift=10, - feat_snip_edges=False): - - P = self.compute_slidwin_left_padding(win_length, win_shift, - snip_edges, feat_frame_length, - feat_frame_shift, - feat_snip_edges) - - tstamps = torch.as_tensor([[i * win_shift, i * win_shift + win_length] - for i in range(num_windows)]) - P + def compute_slidwin_timestamps( + self, + num_windows, + win_length, + win_shift, + snip_edges=False, + feat_frame_length=25, + feat_frame_shift=10, + feat_snip_edges=False, + ): + + P = self.compute_slidwin_left_padding( + win_length, + win_shift, + snip_edges, + feat_frame_length, + feat_frame_shift, + feat_snip_edges, + ) + + tstamps = ( + torch.as_tensor( + [ + [i * win_shift, i * win_shift + win_length] + for i in range(num_windows) + ] + ) + - P + ) tstamps[tstamps < 0] = 0 return tstamps - def compute_slidwin_left_padding(self, - win_length, - win_shift, - snip_edges=False, - feat_frame_length=25, - feat_frame_shift=10, - feat_snip_edges=False): + def compute_slidwin_left_padding( + self, + win_length, + win_shift, + snip_edges=False, + feat_frame_length=25, + feat_frame_shift=10, + feat_snip_edges=False, + ): # pass feat times from msecs to secs feat_frame_shift = feat_frame_shift / 1000 @@ -363,9 +384,8 @@ def compute_slidwin_left_padding(self, # get length and shift in number of feature frames H = win_shift / feat_frame_shift - L = (win_length - feat_frame_length + - feat_frame_shift) / feat_frame_shift - assert L > 0.5, 'win-length should be longer than feat-frame-length' + L = (win_length - feat_frame_length + feat_frame_shift) / feat_frame_shift + assert L > 0.5, "win-length should be longer than feat-frame-length" # compute left padding in case of snip_edges is False if snip_edges: @@ -374,7 +394,9 @@ def compute_slidwin_left_padding(self, Q = ( L - H ) / 2 # left padding in frames introduced by x-vector sliding window - P1 = Q * feat_frame_shift # left padding in secs introduced by x-vector sliding window + P1 = ( + Q * feat_frame_shift + ) # left padding in secs introduced by x-vector sliding 
window if feat_snip_edges: # left padding introduced when computing acoustic feats @@ -393,12 +415,12 @@ def get_config(self): classif_cfg = self.classif_net.get_config() config = { - 'encoder_cfg': enc_cfg, - 'num_classes': self.num_classes, - 'conformer_net': self.conformer_cfg, - 'pool_net': pool_cfg, - 'classif_net': self.classif_cfg, - 'in_feats': self.in_feats + "encoder_cfg": enc_cfg, + "num_classes": self.num_classes, + "conformer_net": self.conformer_cfg, + "pool_net": pool_cfg, + "classif_net": self.classif_cfg, + "in_feats": self.in_feats, } base_config = super().get_config() @@ -407,9 +429,9 @@ def get_config(self): @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - encoder_net = TorchNALoader.load_from_cfg(cfg=cfg['encoder_cfg']) + encoder_net = TorchNALoader.load_from_cfg(cfg=cfg["encoder_cfg"]) - for k in ('encoder_cfg'): + for k in "encoder_cfg": del cfg[k] model = cls(encoder_net, **cfg) @@ -418,21 +440,25 @@ def load(cls, file_path=None, cfg=None, state_dict=None): return model - def rebuild_output_layer(self, - num_classes=None, - loss_type='arc-softmax', - s=64, - margin=0.3, - margin_warmup_epochs=10): - if (self.num_classes is not None and self.num_classes != num_classes - ) or (self.loss_type != loss_type): + def rebuild_output_layer( + self, + num_classes=None, + loss_type="arc-softmax", + s=64, + margin=0.3, + margin_warmup_epochs=10, + ): + if (self.num_classes is not None and self.num_classes != num_classes) or ( + self.loss_type != loss_type + ): # if we change the number of classes or the loss-type # we need to reinitiate the last layer - self.classif_net.rebuild_output_layer(num_classes, loss_type, s, - margin, margin_warmup_epochs) + self.classif_net.rebuild_output_layer( + num_classes, loss_type, s, margin, margin_warmup_epochs + ) return - #otherwise we just change the values of s, margin and margin_warmup + # otherwise we just change the values of s, margin and margin_warmup self.classif_net.set_margin(margin) self.classif_net.set_margin_warmup_epochs(margin_warmup_epochs) self.classif_net.set_s(s) @@ -448,8 +474,8 @@ def freeze_preembed_layers(self): layer_list = [l for l in range(self.embed_layer)] self.classif_net.freeze_layers(layer_list) - def train_mode(self, mode='ft-embed-affine'): - if mode == 'ft-full' or mode == 'train': + def train_mode(self, mode="ft-embed-affine"): + if mode == "ft-full" or mode == "train": self.train() return @@ -463,19 +489,18 @@ def train_mode(self, mode='ft-embed-affine'): @staticmethod def filter_args(**kwargs): - valid_args = ('num_classes', 'in_feats') + valid_args = ("num_classes", "in_feats") args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) # get arguments for conformer - conformer_args = ConformerEncoderV1.filter_args( - **kwargs['conformer_net']) - args['corformer_net'] = conformer_args + conformer_args = ConformerEncoderV1.filter_args(**kwargs["conformer_net"]) + args["corformer_net"] = conformer_args # get arguments for pooling - pool_args = PF.filter_args(**kwargs['pool_net']) - args['pool_net'] = pool_args + pool_args = PF.filter_args(**kwargs["pool_net"]) + args["pool_net"] = pool_args # get arguments for classif head - classif_args = ClassifHead.filter_args(**kwargs['classif_net']) - args['classif_net'] = classif_args + classif_args = ClassifHead.filter_args(**kwargs["classif_net"]) + args["classif_net"] = classif_args return args @@ -483,21 +508,23 @@ def filter_args(**kwargs): def 
add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") - CoformerEncoderV1.add_class_args(parser, prefix='conformer_net') - PF.add_class_args(parser, - prefix='pool_net', - skip=['dim', 'in_feats', 'keepdim']) - ClassifHead.add_class_args(parser, prefix='classif_net') + CoformerEncoderV1.add_class_args(parser, prefix="conformer_net") + PF.add_class_args( + parser, prefix="pool_net", skip=["dim", "in_feats", "keepdim"] + ) + ClassifHead.add_class_args(parser, prefix="classif_net") if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser), - help='xvector options') + outer_parser.add_argument( + "--" + prefix, + action=ActionParser(parser=parser), + help="xvector options", + ) @staticmethod def filter_finetune_args(**kwargs): - valid_args = ('loss_type', 's', 'margin', 'margin_warmup_epochs') + valid_args = ("loss_type", "s", "margin", "margin_warmup_epochs") args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args @@ -506,40 +533,34 @@ def filter_finetune_args(**kwargs): def add_finetune_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--loss-type', - default='arc-softmax', - choices=[ - 'softmax', 'arc-softmax', 'cos-softmax', - 'subcenter-arc-softmax' - ], - help= - 'loss type: softmax, arc-softmax, cos-softmax, subcenter-arc-softmax' + "--loss-type", + default="arc-softmax", + choices=["softmax", "arc-softmax", "cos-softmax", "subcenter-arc-softmax"], + help="loss type: softmax, arc-softmax, cos-softmax, subcenter-arc-softmax", ) - parser.add_argument('--s', - default=64, - type=float, - help='scale for arcface') + parser.add_argument("--s", default=64, type=float, help="scale for arcface") - parser.add_argument('--margin', - default=0.3, - type=float, - help='margin for arcface, cosface,...') + parser.add_argument( + "--margin", default=0.3, type=float, help="margin for arcface, cosface,..." 
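The `--loss-type`, `--s`, and `--margin` options above configure margin-based softmax heads. As a rough reminder of what the scale and margin control, the standard AAM/arc-softmax target logit is s * cos(theta + margin); this is the textbook formulation, not code from this repository:

    import math

    def aam_target_logit(cos_theta, s=64.0, margin=0.3):
        # penalize the target class by an additive angular margin, then rescale
        theta = math.acos(max(min(cos_theta, 1.0), -1.0))
        return s * math.cos(theta + margin)

    aam_target_logit(0.8)   # ~37.6, versus s * 0.8 = 51.2 with no margin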
+ ) parser.add_argument( - '--margin-warmup-epochs', + "--margin-warmup-epochs", default=10, type=float, - help='number of epoch until we set the final margin') + help="number of epoch until we set the final margin", + ) - parser.add_argument('--num-subcenters', - default=2, - type=float, - help='number of subcenters in subcenter losses') + parser.add_argument( + "--num-subcenters", + default=2, + type=float, + help="number of subcenters in subcenter losses", + ) if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) diff --git a/hyperion/torch/models/vae/vae.py b/hyperion/torch/models/vae/vae.py index 21ed2720..32239718 100644 --- a/hyperion/torch/models/vae/vae.py +++ b/hyperion/torch/models/vae/vae.py @@ -34,11 +34,20 @@ class VAE(TorchModel): data_scale = for future use """ - def __init__(self, encoder_net, decoder_net, z_dim, kldiv_weight=1, - qz_pdf='normal-glob-diag-cov', pz_pdf='std-normal', - px_pdf='normal-glob-diag-cov', - flatten_spatial=False, spatial_shape=None, - scale_invariant=False, data_scale=None): + def __init__( + self, + encoder_net, + decoder_net, + z_dim, + kldiv_weight=1, + qz_pdf="normal-glob-diag-cov", + pz_pdf="std-normal", + px_pdf="normal-glob-diag-cov", + flatten_spatial=False, + spatial_shape=None, + scale_invariant=False, + data_scale=None, + ): super().__init__() self.encoder_net = encoder_net self.decoder_net = decoder_net @@ -54,9 +63,9 @@ def __init__(self, encoder_net, decoder_net, z_dim, kldiv_weight=1, # infer input feat dimension from encoder network in_shape = encoder_net.in_shape() - # number of dimensions of input/output enc/dec tensors, + # number of dimensions of input/output enc/dec tensors, # needed to connect the blocks - self._enc_in_dim = len(in_shape) + self._enc_in_dim = len(in_shape) self._enc_out_dim = self.encoder_net.out_dim() self._dec_in_dim = self.decoder_net.in_dim() self._dec_out_dim = self.decoder_net.out_dim() @@ -79,26 +88,27 @@ def __init__(self, encoder_net, decoder_net, z_dim, kldiv_weight=1, self._make_post_dec_layer() self.t2qz = self._make_t2pdf_layer( - qz_pdf, qz_in_channels, self.z_dim, qz_in_dim) + qz_pdf, qz_in_channels, self.z_dim, qz_in_dim + ) self.t2px = self._make_t2pdf_layer( - px_pdf, self._dec_out_channels, self.in_channels, self._dec_out_dim) + px_pdf, self._dec_out_channels, self.in_channels, self._dec_out_dim + ) self._make_prior() - @property def pz(self): return self._pz() - - + def _compute_flatten_unflatten_shapes(self): - # if we flatten the spatial dimension to have a single + # if we flatten the spatial dimension to have a single # latent representation for all time/spatial positions - # we have to infer the spatial dimension at the encoder + # we have to infer the spatial dimension at the encoder # output - assert spatial_shape is not None, ( - 'you need to specify spatial shape at the input') - + assert ( + spatial_shape is not None + ), "you need to specify spatial shape at the input" + enc_in_shape = None, self.in_channels, *self.spatial_shape enc_out_shape = self.encoder_net.out_shape(enc_in_shape) self._enc_out_shape = enc_out_shape[1:] @@ -118,79 +128,66 @@ def _compute_flatten_unflatten_shapes(self): dec_in_tot_feats = 1 for d in self._enc_in_shape: dec_in_tot_feats *= d - - self._dec_in_tot_feats = dec_in_tot_feats - + self._dec_in_tot_feats = dec_in_tot_feats def _flatten(self, x): return x.view(-1, self._enc_out_tot_feats) - - def _unflatten(sef, x): return 
x.view(-1, *self._dec_in_shape) - - def _make_prior(self): if self.flatten_spatial: shape = (self.z_dim,) else: - shape = self.z_dim, *(1,)*(self._enc_out_dim - 2) + shape = self.z_dim, *(1,) * (self._enc_out_dim - 2) - if self.pz_pdf == 'std-normal': + if self.pz_pdf == "std-normal": self._pz = pdf_storage.StdNormal(shape) else: - raise ValueError('pz=%s not supported' % self.pz_pdf) - - + raise ValueError("pz=%s not supported" % self.pz_pdf) def _make_t2pdf_layer(self, pdf_name, in_channels, channels, ndims): - pdf_dict = { - 'normal-i-cov': t2pdf.Tensor2NormalICov, - 'normal-glob-diag-cov': t2pdf.Tensor2NormalGlobDiagCov, - 'normal-diag-cov': t2pdf.Tensor2NormalDiagCov, - 'bay-normal-i-cov': t2pdf.Tensor2BayNormalICovGivenNormalPrior, - 'bay-normal-glob-diag-cov': t2pdf.Tensor2BayNormalGlobDiagCovGivenNormalPrior, - 'bay-normal-diag-cov': t2pdf.Tensor2BayNormalDiagCovGivenNormalPrior } + pdf_dict = { + "normal-i-cov": t2pdf.Tensor2NormalICov, + "normal-glob-diag-cov": t2pdf.Tensor2NormalGlobDiagCov, + "normal-diag-cov": t2pdf.Tensor2NormalDiagCov, + "bay-normal-i-cov": t2pdf.Tensor2BayNormalICovGivenNormalPrior, + "bay-normal-glob-diag-cov": t2pdf.Tensor2BayNormalGlobDiagCovGivenNormalPrior, + "bay-normal-diag-cov": t2pdf.Tensor2BayNormalDiagCovGivenNormalPrior, + } t2pdf_layer = pdf_dict[pdf_name](channels, in_feats=in_channels, in_dim=ndims) return t2pdf_layer - def _make_post_enc_layer(self): pass - def _make_pre_dec_layer(self): if self.flatten_spatial: - self._pre_dec_linear = Linear(self.z_dim, self._dec_in_tot_dim) + self._pre_dec_linear = Linear(self.z_dim, self._dec_in_tot_dim) - def _make_post_dec_layer(self): pass - def _pre_enc(self, x): if x.dim() == 3 and self._enc_in_dim == 4: return x.unsqueeze(1) return x - def _post_enc(self, x): if self.flatten_spatial: x = self._flatten(x) - - return x + return x def _pre_dec(self, x): if self.flatten_spatial: - x = self._prec_dec_linear(x) #linear projection + x = self._prec_dec_linear(x) # linear projection x = self._unflatten(x) return x @@ -202,45 +199,68 @@ def _pre_dec(self, x): return x - def _post_px(self, px, x_shape): px_shape = px.batch_shape - - if len(px_shape) == 4 and len(x_shape)==3: - if px_shape[1]==1: + + if len(px_shape) == 4 and len(x_shape) == 3: + if px_shape[1] == 1: px = squeeze_pdf(px, dim=1) else: - raise ValueError('P(x|z)-shape != x-shape') - - return px + raise ValueError("P(x|z)-shape != x-shape") + return px - def forward(self, x, x_target=None, - return_x_mean=False, - return_x_sample=False, return_z_sample=False, - return_px=False, return_qz=False, serialize_pdfs=True, - use_amp=False): + def forward( + self, + x, + x_target=None, + return_x_mean=False, + return_x_sample=False, + return_z_sample=False, + return_px=False, + return_qz=False, + serialize_pdfs=True, + use_amp=False, + ): if use_amp: with torch.cuda.amp.autocast(): return self._forward( - x, x_target, - return_x_mean, return_x_sample, return_z_sample, - return_px, return_qz, serialize_pdfs) - + x, + x_target, + return_x_mean, + return_x_sample, + return_z_sample, + return_px, + return_qz, + serialize_pdfs, + ) + return self._forward( - x, x_target, - return_x_mean, return_x_sample, return_z_sample, - return_px, return_qz, serialize_pdfs) - - - def _forward(self, x, x_target=None, - return_x_mean=False, - return_x_sample=False, return_z_sample=False, - return_px=False, return_qz=False, serialize_pdfs=True): - + x, + x_target, + return_x_mean, + return_x_sample, + return_z_sample, + return_px, + return_qz, + serialize_pdfs, + ) + + def 
_forward( + self, + x, + x_target=None, + return_x_mean=False, + return_x_sample=False, + return_z_sample=False, + return_px=False, + return_qz=False, + serialize_pdfs=True, + ): + if x_target is None: x_target = x - + x = self._pre_enc(x) xx = self.encoder_net(x) xx = self._post_enc(xx) @@ -252,8 +272,9 @@ def _forward(self, x, x_target=None, # print(self.pz.loc) # print(self.pz.scale) - kldiv_qzpz = pdf.kl.kl_divergence(qz, self._pz()).view( - x.size(0), -1).sum(dim=-1) + kldiv_qzpz = ( + pdf.kl.kl_divergence(qz, self._pz()).view(x.size(0), -1).sum(dim=-1) + ) z = qz.rsample() zz = self._pre_dec(z) @@ -265,36 +286,31 @@ def _forward(self, x, x_target=None, px = self.t2px(zz, squeeze_dim=squeeze_dim) # we normalize the elbo by spatial/time samples and feature dimension - log_px = px.log_prob(x_target).view( - x.size(0), -1) + log_px = px.log_prob(x_target).view(x.size(0), -1) num_samples = log_px.size(-1) log_px = log_px.mean(dim=-1) # kldiv must be normalized by number of elements in x, not in z!! - kldiv_qzpz /= num_samples - elbo = log_px - self.kldiv_weight*kldiv_qzpz + kldiv_qzpz /= num_samples + elbo = log_px - self.kldiv_weight * kldiv_qzpz # we build the return dict - r = {'elbo': elbo, - 'log_px': log_px, - 'kldiv_z': kldiv_qzpz} + r = {"elbo": elbo, "log_px": log_px, "kldiv_z": kldiv_qzpz} if return_x_mean: - r['x_mean'] = px.mean - + r["x_mean"] = px.mean + if return_x_sample: if px.has_rsample: x_sample = px.rsample() else: x_sample = px.sample() - r['x_sample'] = x_sample + r["x_sample"] = x_sample if return_z_sample: - r['z'] = z + r["z"] = z return r - - def compute_qz(self, x): xx = self._pre_enc(x) @@ -303,7 +319,6 @@ def compute_qz(self, x): qz = self.t2qz(xx, self.pz) return qz - def compute_px_given_z(self, z, x_shape=None): zz = self._pre_dec(z) @@ -316,83 +331,88 @@ def compute_px_given_z(self, z, x_shape=None): px = self.t2px(zz, squeeze_dim=squeeze_dim) return px - def get_config(self): enc_cfg = self.encoder_net.get_config() dec_cfg = self.decoder_net.get_config() - config = {'encoder_cfg': enc_cfg, - 'decoder_cfg': dec_cfg, - 'z_dim': self.z_dim, - 'qz_pdf': self.qz_pdf, - 'pz_pdf': self.pz_pdf, - 'px_pdf': self.px_pdf, - 'kldiv_weight': self.kldiv_weight, - 'flatten_spatial': self.flatten_spatial, - 'spatial_shape': self.spatial_shape, - 'scale_invariant': self.scale_invariant, - 'data_scale': self.data_scale } + config = { + "encoder_cfg": enc_cfg, + "decoder_cfg": dec_cfg, + "z_dim": self.z_dim, + "qz_pdf": self.qz_pdf, + "pz_pdf": self.pz_pdf, + "px_pdf": self.px_pdf, + "kldiv_weight": self.kldiv_weight, + "flatten_spatial": self.flatten_spatial, + "spatial_shape": self.spatial_shape, + "scale_invariant": self.scale_invariant, + "data_scale": self.data_scale, + } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict( - file_path, cfg, state_dict) + cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - encoder_net = TorchNALoader.load_from_cfg(cfg=cfg['encoder_cfg']) - decoder_net = TorchNALoader.load_from_cfg(cfg=cfg['decoder_cfg']) - for k in ('encoder_cfg', 'decoder_cfg'): + encoder_net = TorchNALoader.load_from_cfg(cfg=cfg["encoder_cfg"]) + decoder_net = TorchNALoader.load_from_cfg(cfg=cfg["decoder_cfg"]) + for k in ("encoder_cfg", "decoder_cfg"): del cfg[k] - - model = cls(encoder_net, decoder_net, **cfg) + + model = cls(encoder_net, decoder_net, **cfg) if state_dict is not 
None: model.load_state_dict(state_dict) return model - - @staticmethod def filter_args(**kwargs): - valid_args = ('z_dim', 'kldiv_weight', 'qz_pdf', 'px_pdf') - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("z_dim", "kldiv_weight", "qz_pdf", "px_pdf") + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args - - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--z-dim', type=int, required=True, - help=('latent factor dimension')) + "--z-dim", type=int, required=True, help=("latent factor dimension") + ) - parser.add_argument('--kldiv-weight', default=1, type=float, - help=('weight of the KL divergance in the ELBO')) + parser.add_argument( + "--kldiv-weight", + default=1, + type=float, + help=("weight of the KL divergance in the ELBO"), + ) parser.add_argument( - '--qz-pdf', default='normal-glob-diag-cov', - choices = ['normal-i-cov', 'normal-glob-diag-cov', 'normal-diag-cov', - 'bay-normal-i-cov', 'bay-normal-glob-diag-cov', 'bay-normal-diag-cov'], - help=('pdf for approx posterior q(z)')) + "--qz-pdf", + default="normal-glob-diag-cov", + choices=[ + "normal-i-cov", + "normal-glob-diag-cov", + "normal-diag-cov", + "bay-normal-i-cov", + "bay-normal-glob-diag-cov", + "bay-normal-diag-cov", + ], + help=("pdf for approx posterior q(z)"), + ) parser.add_argument( - '--px-pdf', default='normal-glob-diag-cov', - choices = ['normal-i-cov', 'normal-glob-diag-cov', 'normal-diag-cov'], - help=('pdf for data likelihood p(x|z)')) + "--px-pdf", + default="normal-glob-diag-cov", + choices=["normal-i-cov", "normal-glob-diag-cov", "normal-diag-cov"], + help=("pdf for data likelihood p(x|z)"), + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='vae options') - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='vae options') add_argparse_args = add_class_args diff --git a/hyperion/torch/models/vae/vq_vae.py b/hyperion/torch/models/vae/vq_vae.py index 0ff81044..9fcc22a0 100644 --- a/hyperion/torch/models/vae/vq_vae.py +++ b/hyperion/torch/models/vae/vq_vae.py @@ -12,7 +12,7 @@ from ...torch_model import TorchModel from ...narchs import TorchNALoader from ...layers import tensor2pdf as t2pdf -from ...layers import vq +from ...layers import vq class VQVAE(TorchModel): @@ -24,7 +24,7 @@ class VQVAE(TorchModel): decoder_net: NArch decoder network object z_dim: latent variable dimension kldiv_weight: weight KL divergene when computing ELBO - diversity_weight: weigth for log-perplexity of the codebook, + diversity_weight: weigth for log-perplexity of the codebook, it inteds to maximize the number of codewords used. vq_type: type of vector quantizer vq_gropus: number of vector quantization groups. 
@@ -40,13 +40,25 @@ class VQVAE(TorchModel): data_scale = for future use """ - def __init__(self, encoder_net, decoder_net, z_dim, kldiv_weight=1, - diversity_weight=0.1, - vq_type='multi-ema-k-means-vq', vq_groups=1, vq_clusters=64, - vq_commitment_cost=0.25, vq_ema_gamma=0.99, vq_ema_eps=1e-5, - px_pdf='normal-glob-diag-cov', - flatten_spatial=False, spatial_shape=None, - scale_invariant=False, data_scale=None): + def __init__( + self, + encoder_net, + decoder_net, + z_dim, + kldiv_weight=1, + diversity_weight=0.1, + vq_type="multi-ema-k-means-vq", + vq_groups=1, + vq_clusters=64, + vq_commitment_cost=0.25, + vq_ema_gamma=0.99, + vq_ema_eps=1e-5, + px_pdf="normal-glob-diag-cov", + flatten_spatial=False, + spatial_shape=None, + scale_invariant=False, + data_scale=None, + ): super().__init__() self.encoder_net = encoder_net @@ -72,9 +84,9 @@ def __init__(self, encoder_net, decoder_net, z_dim, kldiv_weight=1, # infer input feat dimension from encoder network in_shape = encoder_net.in_shape() - # number of dimension of input/output enc/dec tensors, + # number of dimension of input/output enc/dec tensors, # needed to connect the blocks - self._enc_in_dim = len(in_shape) + self._enc_in_dim = len(in_shape) self._enc_out_dim = self.encoder_net.out_dim() self._dec_in_dim = self.decoder_net.in_dim() self._dec_out_dim = self.decoder_net.out_dim() @@ -98,18 +110,18 @@ def __init__(self, encoder_net, decoder_net, z_dim, kldiv_weight=1, self._make_vq_layer(qz_in_channels, qz_in_dim) self.t2px = self._make_t2pdf_layer( - px_pdf, self._dec_out_channels, self.in_channels, self._dec_out_dim) + px_pdf, self._dec_out_channels, self.in_channels, self._dec_out_dim + ) - - def _compute_flatten_unflatten_shapes(self): - # if we flatten the spatial dimension to have a single + # if we flatten the spatial dimension to have a single # latent representation for all time/spatial positions - # we have to infer the spatial dimension at the encoder + # we have to infer the spatial dimension at the encoder # output - assert spatial_shape is not None, ( - 'you need to specify spatial shape at the input') - + assert ( + spatial_shape is not None + ), "you need to specify spatial shape at the input" + enc_in_shape = None, self.in_channels, *self.spatial_shape enc_out_shape = self.encoder_net.out_shape(enc_in_shape) self._enc_out_shape = enc_out_shape[1:] @@ -129,46 +141,36 @@ def _compute_flatten_unflatten_shapes(self): dec_in_tot_feats = 1 for d in self._enc_in_shape: dec_in_tot_feats *= d - - self._dec_in_tot_feats = dec_in_tot_feats - + self._dec_in_tot_feats = dec_in_tot_feats def _flatten(self, x): return x.view(-1, self._enc_out_tot_feats) - - def _unflatten(sef, x): return x.view(-1, *self._dec_in_shape) - - def _make_t2pdf_layer(self, pdf_name, in_channels, channels, ndims): - pdf_dict = { - 'normal-i-cov': t2pdf.Tensor2NormalICov, - 'normal-glob-diag-cov': t2pdf.Tensor2NormalGlobDiagCov, - 'normal-diag-cov': t2pdf.Tensor2NormalDiagCov } + pdf_dict = { + "normal-i-cov": t2pdf.Tensor2NormalICov, + "normal-glob-diag-cov": t2pdf.Tensor2NormalGlobDiagCov, + "normal-diag-cov": t2pdf.Tensor2NormalDiagCov, + } t2pdf_layer = pdf_dict[pdf_name](channels, in_feats=in_channels, in_dim=ndims) return t2pdf_layer - - def _make_post_enc_layer(self): pass - def _make_pre_dec_layer(self): if self.flatten_spatial: - self._pre_dec_linear = Linear(self.z_dim, self._dec_in_tot_dim) + self._pre_dec_linear = Linear(self.z_dim, self._dec_in_tot_dim) - def _make_post_dec_layer(self): pass - - + def _pre_enc(self, x): if x.dim() == 3 and 
self._enc_in_dim == 4: return x.unsqueeze(1) @@ -178,13 +180,12 @@ def _pre_enc(self, x): def _post_enc(self, x): if self.flatten_spatial: x = self._flatten(x) - + return x - def _pre_dec(self, x): if self.flatten_spatial: - x = self._prec_dec_linear(x) #linear projection + x = self._prec_dec_linear(x) # linear projection x = self._unflatten(x) return x @@ -196,60 +197,98 @@ def _pre_dec(self, x): return x - def _make_vq_layer(self, in_feats, in_dim): - if self.vq_type == 'multi-k-means-vq': + if self.vq_type == "multi-k-means-vq": vq_layer = vq.MultiKMeansVectorQuantizer( - self.vq_groups, self.vq_clusters, self.z_dim, - self.vq_commitment_cost, - in_feats=in_feats, in_dim=in_dim) - elif self.vq_type == 'multi-ema-k-means-vq': + self.vq_groups, + self.vq_clusters, + self.z_dim, + self.vq_commitment_cost, + in_feats=in_feats, + in_dim=in_dim, + ) + elif self.vq_type == "multi-ema-k-means-vq": vq_layer = vq.MultiEMAKMeansVectorQuantizer( - self.vq_groups, self.vq_clusters, self.z_dim, - self.vq_commitment_cost, self.vq_ema_gamma, self.vq_ema_eps, - in_feats=in_feats, in_dim=in_dim) - elif self.vq_type == 'k-means-vq': + self.vq_groups, + self.vq_clusters, + self.z_dim, + self.vq_commitment_cost, + self.vq_ema_gamma, + self.vq_ema_eps, + in_feats=in_feats, + in_dim=in_dim, + ) + elif self.vq_type == "k-means-vq": vq_layer = vq.KMeansVectorQuantizer( - self.vq_clusters, self.z_dim, + self.vq_clusters, + self.z_dim, self.vq_commitment_cost, - in_feats=in_feats, in_dim=in_dim) - elif self.vq_type == 'ema-k-means-vq': + in_feats=in_feats, + in_dim=in_dim, + ) + elif self.vq_type == "ema-k-means-vq": vq_layer = vq.EMAKMeansVectorQuantizer( - self.vq_clusters, self.z_dim, - self.vq_commitment_cost, self.vq_ema_gamma, self.vq_ema_eps, - in_feats=in_feats, in_dim=in_dim) + self.vq_clusters, + self.z_dim, + self.vq_commitment_cost, + self.vq_ema_gamma, + self.vq_ema_eps, + in_feats=in_feats, + in_dim=in_dim, + ) else: - raise ValueError('vq_type=%s not supported' % (self.vq_type)) - - self.vq_layer = vq_layer + raise ValueError("vq_type=%s not supported" % (self.vq_type)) + self.vq_layer = vq_layer - def forward(self, x, x_target=None, - return_x_mean=False, - return_x_sample=False, return_z_sample=False, - return_px=False, serialize_pdfs=True, use_amp=False): + def forward( + self, + x, + x_target=None, + return_x_mean=False, + return_x_sample=False, + return_z_sample=False, + return_px=False, + serialize_pdfs=True, + use_amp=False, + ): if use_amp: with torch.cuda.amp.autocast(): return self._forward( - x, x_target, - return_x_mean, return_x_sample, return_z_sample, - return_px, serialize_pdfs) + x, + x_target, + return_x_mean, + return_x_sample, + return_z_sample, + return_px, + serialize_pdfs, + ) return self._forward( - x, x_target, - return_x_mean, return_x_sample, return_z_sample, - return_px, serialize_pdfs) - - - def _forward(self, x, x_target=None, - return_x_mean=False, - return_x_sample=False, return_z_sample=False, - return_px=False, serialize_pdfs=True): - + x, + x_target, + return_x_mean, + return_x_sample, + return_z_sample, + return_px, + serialize_pdfs, + ) + + def _forward( + self, + x, + x_target=None, + return_x_mean=False, + return_x_sample=False, + return_z_sample=False, + return_px=False, + serialize_pdfs=True, + ): + if x_target is None: x_target = x - + xx = self._pre_enc(x) xx = self.encoder_net(xx) xx = self._post_enc(xx) @@ -257,8 +296,8 @@ def _forward(self, x, x_target=None, vq_output = self.vq_layer(xx) # extract the variables from the dict. 
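For context on the quantizers instantiated in `_make_vq_layer` above, a simplified generic sketch of nearest-codeword quantization with a commitment term and a straight-through gradient; this illustrates the general VQ-VAE recipe only and is not the repository's `vq` implementation:

    import torch

    def kmeans_vq_sketch(z_e, codebook, commitment_cost=0.25):
        # z_e: (batch, dim) encoder outputs; codebook: (num_clusters, dim)
        d = (z_e.unsqueeze(1) - codebook.unsqueeze(0)).pow(2).sum(-1)  # squared distances
        idx = d.argmin(dim=1)                                          # nearest codeword
        z_q = codebook[idx]
        # codebook loss plus beta-weighted commitment loss
        loss = (z_q - z_e.detach()).pow(2).mean() \
            + commitment_cost * (z_e - z_q.detach()).pow(2).mean()
        z_q = z_e + (z_q - z_e).detach()  # straight-through estimator
        return z_q, loss, idx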
z, vq_loss, kldiv_z, log_perplexity = ( - vq_output[i] for i in [ - 'z_q', 'loss', 'kldiv_qrpr', 'log_perplexity']) + vq_output[i] for i in ["z_q", "loss", "kldiv_qrpr", "log_perplexity"] + ) zz = self._pre_dec(z) zz = self.decoder_net(zz, target_shape=x_target.shape) @@ -268,165 +307,200 @@ def _forward(self, x, x_target=None, px = self.t2px(zz, squeeze_dim=squeeze_dim) # we normalize the elbo by spatial/time samples and feature dimension - log_px = px.log_prob(x_target).view( - x.size(0), -1) + log_px = px.log_prob(x_target).view(x.size(0), -1) num_samples = log_px.size(-1) log_px = log_px.mean(dim=-1) # kldiv must be normalized by number of elements in x, not in z!! - kldiv_z /= num_samples - elbo = log_px - self.kldiv_weight*kldiv_z + kldiv_z /= num_samples + elbo = log_px - self.kldiv_weight * kldiv_z - loss = - elbo + vq_loss - self.diversity_weight * log_perplexity + loss = -elbo + vq_loss - self.diversity_weight * log_perplexity # we build the return dict - r = {'loss': loss, - 'elbo': elbo, - 'log_px': log_px, - 'kldiv_z': kldiv_z, - 'vq_loss': vq_loss, - 'log_perplexity': log_perplexity} + r = { + "loss": loss, + "elbo": elbo, + "log_px": log_px, + "kldiv_z": kldiv_z, + "vq_loss": vq_loss, + "log_perplexity": log_perplexity, + } if return_x_mean: - r['x_mean'] = px.mean - + r["x_mean"] = px.mean + if return_x_sample: if px.has_rsample: x_sample = px.rsample() else: x_sample = px.sample() - r['x_sample'] = x_sample + r["x_sample"] = x_sample if return_z_sample: - r['z'] = z + r["z"] = z return r - def compute_z(self, x): x = self._pre_enc(x) xx = self.encoder_net(xx) xx = self._post_enc(xx) vq_output = self.vq_layer(xx) - return vq_output['z'] - + return vq_output["z"] def compute_px_given_z(self, z, x_shape=None): zz = self._pre_dec(z) - zz = self.decoder_net(zz, target_shape = x_shape) + zz = self.decoder_net(zz, target_shape=x_shape) squeeze_dim = None if x_target.dim() == 3 and zz.dim() == 4: squeeze_dim = 1 px = self.t2px(zz, squeeze_dim=squeeze_dim) return px - def get_config(self): enc_cfg = self.encoder_net.get_config() dec_cfg = self.decoder_net.get_config() - config = {'encoder_cfg': enc_cfg, - 'decoder_cfg': dec_cfg, - 'z_dim': self.z_dim, - 'vq_type': self.vq_type, - 'vq_groups': self.vq_groups, - 'vq_clusters': self.vq_clusters, - 'vq_commitment_cost': self.vq_commitment_cost, - 'vq_ema_gamma': self.vq_ema_gamma, - 'vq_ema_eps': self.vq_ema_eps, - 'px_pdf': self.px_pdf, - 'kldiv_weight': self.kldiv_weight, - 'diversity_weight': self.diversity_weight, - 'flatten_spatial': self.flatten_spatial, - 'spatial_shape': self.spatial_shape, - 'scale_invariant': self.scale_invariant, - 'data_scale': self.data_scale } + config = { + "encoder_cfg": enc_cfg, + "decoder_cfg": dec_cfg, + "z_dim": self.z_dim, + "vq_type": self.vq_type, + "vq_groups": self.vq_groups, + "vq_clusters": self.vq_clusters, + "vq_commitment_cost": self.vq_commitment_cost, + "vq_ema_gamma": self.vq_ema_gamma, + "vq_ema_eps": self.vq_ema_eps, + "px_pdf": self.px_pdf, + "kldiv_weight": self.kldiv_weight, + "diversity_weight": self.diversity_weight, + "flatten_spatial": self.flatten_spatial, + "spatial_shape": self.spatial_shape, + "scale_invariant": self.scale_invariant, + "data_scale": self.data_scale, + } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict( - file_path, cfg, state_dict) + cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, 
state_dict) - encoder_net = TorchNALoader.load_from_cfg(cfg=cfg['encoder_cfg']) - decoder_net = TorchNALoader.load_from_cfg(cfg=cfg['decoder_cfg']) - for k in ('encoder_cfg', 'decoder_cfg'): + encoder_net = TorchNALoader.load_from_cfg(cfg=cfg["encoder_cfg"]) + decoder_net = TorchNALoader.load_from_cfg(cfg=cfg["decoder_cfg"]) + for k in ("encoder_cfg", "decoder_cfg"): del cfg[k] - - model = cls(encoder_net, decoder_net, **cfg) + + model = cls(encoder_net, decoder_net, **cfg) if state_dict is not None: model.load_state_dict(state_dict) return model - - - @staticmethod def filter_args(**kwargs): - valid_args = ('z_dim', 'kldiv_weight', 'diversity_weight', - 'vq_type', 'vq_groups', 'vq_clusters', - 'vq_commitment_cost', 'vq_ema_gamma', 'vq_ema_eps', 'px_pdf') - - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ( + "z_dim", + "kldiv_weight", + "diversity_weight", + "vq_type", + "vq_groups", + "vq_clusters", + "vq_commitment_cost", + "vq_ema_gamma", + "vq_ema_eps", + "px_pdf", + ) + + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args - - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--z-dim', type=int, required=True, - help=('latent factor dimension')) + "--z-dim", type=int, required=True, help=("latent factor dimension") + ) - parser.add_argument('--kldiv-weight', default=1, type=float, - help=('weight of the KL divergance in the ELBO')) + parser.add_argument( + "--kldiv-weight", + default=1, + type=float, + help=("weight of the KL divergance in the ELBO"), + ) - parser.add_argument('--diversity-weight', default=0.1, type=float, - help=('weight of the log-perplexity in the loss')) + parser.add_argument( + "--diversity-weight", + default=0.1, + type=float, + help=("weight of the log-perplexity in the loss"), + ) parser.add_argument( - '--vq-type', default='ema-k-means-vq', - choices = ['k-means-vq', 'multi-k-means-vq', 'ema-k-means-vq', 'multi-ema-k-means-vq'], - help=('type of vector quantization layer')) + "--vq-type", + default="ema-k-means-vq", + choices=[ + "k-means-vq", + "multi-k-means-vq", + "ema-k-means-vq", + "multi-ema-k-means-vq", + ], + help=("type of vector quantization layer"), + ) parser.add_argument( - '--vq-groups', default=1, type=int, - help=('number of groups in mulit-vq layers')) + "--vq-groups", + default=1, + type=int, + help=("number of groups in mulit-vq layers"), + ) parser.add_argument( - '--vq-clusters', default=64, type=int, - help=('size of the codebooks')) + "--vq-clusters", default=64, type=int, help=("size of the codebooks") + ) - parser.add_argument('--vq-commitment-cost', default=0.25, type=float, - help=('commitment loss weight (beta in VQ-VAE paper)')) + parser.add_argument( + "--vq-commitment-cost", + default=0.25, + type=float, + help=("commitment loss weight (beta in VQ-VAE paper)"), + ) - parser.add_argument('--vq-ema-gamma', default=0.99, type=float, - help=('decay parameter for exponential moving ' - 'average calculation of the embeddings')) + parser.add_argument( + "--vq-ema-gamma", + default=0.99, + type=float, + help=( + "decay parameter for exponential moving " + "average calculation of the embeddings" + ), + ) - parser.add_argument('--vq-ema-eps', default=1e-5, type=float, - help=('pseudo-count value for Laplace smoothing ' - 'of cluster counts for exponential moving ' - 'avarage calculation of the embeddings')) + parser.add_argument( + 
"--vq-ema-eps", + default=1e-5, + type=float, + help=( + "pseudo-count value for Laplace smoothing " + "of cluster counts for exponential moving " + "avarage calculation of the embeddings" + ), + ) parser.add_argument( - '--px-pdf', default='normal-glob-diag-cov', - choices = ['normal-i-cov', 'normal-glob-diag-cov', 'normal-diag-cov'], - help=('pdf for data likelihood p(x|z)')) + "--px-pdf", + default="normal-glob-diag-cov", + choices=["normal-i-cov", "normal-glob-diag-cov", "normal-diag-cov"], + help=("pdf for data likelihood p(x|z)"), + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='vae options') + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='vae options') add_argparse_args = add_class_args diff --git a/hyperion/torch/models/xvectors/efficient_net_xvector.py b/hyperion/torch/models/xvectors/efficient_net_xvector.py index 92d92be5..0a0b174d 100644 --- a/hyperion/torch/models/xvectors/efficient_net_xvector.py +++ b/hyperion/torch/models/xvectors/efficient_net_xvector.py @@ -14,55 +14,91 @@ class EfficientNetXVector(XVector): - - def __init__(self, effnet_type, in_feats, num_classes, - in_channels=1, in_conv_channels=32, - in_kernel_size=3, in_stride=2, - mbconv_repeats=[1, 2, 2, 3, 3, 4, 1], - mbconv_channels=[16, 24, 40, 80, 112, 192, 320], - mbconv_kernel_sizes=[3, 3, 5, 3, 5, 5, 3], - mbconv_strides=[1, 2, 2, 2, 1, 2, 1], - mbconv_expansions=[1, 6, 6, 6, 6, 6, 6], - head_channels=1280, - width_scale=None, depth_scale=None, - fix_stem_head=False, - se_r=4, time_se=False, - pool_net='mean+stddev', - embed_dim=256, - num_embed_layers=1, - hid_act='swish', - loss_type='arc-softmax', - s=64, margin=0.3, margin_warmup_epochs=0, - num_subcenters=2, - drop_connect_rate=0.2, dropout_rate=0, - norm_layer=None, head_norm_layer=None, - use_norm=True, - norm_before=True, - embed_layer=0, proj_feats=None): - - logging.info('making %s encoder network' % (effnet_type)) + def __init__( + self, + effnet_type, + in_feats, + num_classes, + in_channels=1, + in_conv_channels=32, + in_kernel_size=3, + in_stride=2, + mbconv_repeats=[1, 2, 2, 3, 3, 4, 1], + mbconv_channels=[16, 24, 40, 80, 112, 192, 320], + mbconv_kernel_sizes=[3, 3, 5, 3, 5, 5, 3], + mbconv_strides=[1, 2, 2, 2, 1, 2, 1], + mbconv_expansions=[1, 6, 6, 6, 6, 6, 6], + head_channels=1280, + width_scale=None, + depth_scale=None, + fix_stem_head=False, + se_r=4, + time_se=False, + pool_net="mean+stddev", + embed_dim=256, + num_embed_layers=1, + hid_act="swish", + loss_type="arc-softmax", + s=64, + margin=0.3, + margin_warmup_epochs=0, + num_subcenters=2, + drop_connect_rate=0.2, + dropout_rate=0, + norm_layer=None, + head_norm_layer=None, + use_norm=True, + norm_before=True, + embed_layer=0, + proj_feats=None, + ): + + logging.info("making %s encoder network" % (effnet_type)) encoder_net = EN( - effnet_type, in_channels, in_conv_channels, in_kernel_size, in_stride, - mbconv_repeats, mbconv_channels, mbconv_kernel_sizes, mbconv_strides, - mbconv_expansions, head_channels, - width_scale=width_scale, depth_scale=depth_scale, - fix_stem_head=fix_stem_head, hid_act=hid_act, - drop_connect_rate=drop_connect_rate, norm_layer=norm_layer, - se_r=se_r, time_se=time_se, in_feats=in_feats) - + effnet_type, + in_channels, + in_conv_channels, + in_kernel_size, + in_stride, + mbconv_repeats, + mbconv_channels, + mbconv_kernel_sizes, + mbconv_strides, + mbconv_expansions, + head_channels, + width_scale=width_scale, + depth_scale=depth_scale, + 
fix_stem_head=fix_stem_head, + hid_act=hid_act, + drop_connect_rate=drop_connect_rate, + norm_layer=norm_layer, + se_r=se_r, + time_se=time_se, + in_feats=in_feats, + ) + super().__init__( - encoder_net, num_classes, pool_net=pool_net, - embed_dim=embed_dim, num_embed_layers=num_embed_layers, - hid_act=hid_act, loss_type=loss_type, - s=s, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + encoder_net, + num_classes, + pool_net=pool_net, + embed_dim=embed_dim, + num_embed_layers=num_embed_layers, + hid_act=hid_act, + loss_type=loss_type, + s=s, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, num_subcenters=num_subcenters, - norm_layer=norm_layer, head_norm_layer=head_norm_layer, - use_norm=use_norm, norm_before=norm_before, + norm_layer=norm_layer, + head_norm_layer=head_norm_layer, + use_norm=use_norm, + norm_before=norm_before, dropout_rate=dropout_rate, - embed_layer=embed_layer, - in_feats=in_feats, proj_feats=proj_feats) + embed_layer=embed_layer, + in_feats=in_feats, + proj_feats=proj_feats, + ) - @property def effnet_type(self): return self.encoder_net.effnet_type @@ -103,7 +139,6 @@ def mbconv_strides(self): def mbconv_expansions(self): return self.encoder_net.mbconv_expansions - @property def head_channels(self): return self.encoder_net.head_channels @@ -139,45 +174,43 @@ def time_se(self): def get_config(self): base_config = super().get_config() - del base_config['encoder_cfg'] + del base_config["encoder_cfg"] pool_cfg = self.pool_net.get_config() - config = {'effnet_type': self.effnet_type, - 'in_channels': self.in_channels, - 'in_conv_channels': self.encoder_net.b0_in_conv_channels, - 'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'mbconv_repeats': self.encoder_net.b0_mbconv_repeats, - 'mbconv_channels': self.encoder_net.b0_mbconv_channels, - 'mbconv_kernel_sizes': self.mbconv_kernel_sizes, - 'mbconv_strides': self.mbconv_strides, - 'mbconv_expansions': self.mbconv_expansions, - 'head_channels': self.head_channels, - 'width_scale': self.encoder_net.cfg_width_scale, - 'depth_scale': self.encoder_net.cfg_width_scale, - 'fix_stem_head': self.fix_stem_head, - 'drop_connect_rate': self.drop_connect_rate, - 'se_r' : self.se_r, - 'time_se': self.time_se, - } + config = { + "effnet_type": self.effnet_type, + "in_channels": self.in_channels, + "in_conv_channels": self.encoder_net.b0_in_conv_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "mbconv_repeats": self.encoder_net.b0_mbconv_repeats, + "mbconv_channels": self.encoder_net.b0_mbconv_channels, + "mbconv_kernel_sizes": self.mbconv_kernel_sizes, + "mbconv_strides": self.mbconv_strides, + "mbconv_expansions": self.mbconv_expansions, + "head_channels": self.head_channels, + "width_scale": self.encoder_net.cfg_width_scale, + "depth_scale": self.encoder_net.cfg_width_scale, + "fix_stem_head": self.fix_stem_head, + "drop_connect_rate": self.drop_connect_rate, + "se_r": self.se_r, + "time_se": self.time_se, + } config.update(base_config) return config - @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - - cfg, state_dict = cls._load_cfg_state_dict( - file_path, cfg, state_dict) - model = cls(**cfg) + cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) + + model = cls(**cfg) if state_dict is not None: model.load_state_dict(state_dict) return model - def filter_args(**kwargs): base_args = XVector.filter_args(**kwargs) @@ -186,24 +219,19 @@ def filter_args(**kwargs): base_args.update(child_args) return base_args - 
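For context on the prefix branch that recurs in every add_class_args below: when a prefix is given, the options are built on a fresh inner parser and attached to the outer one through jsonargparse's ActionParser, so they surface as nested --<prefix>.<option> arguments; filter_args is the inverse step that keeps only the kwargs the class recognizes before construction. A rough usage sketch under those assumptions (the option names, values, and the effnet_type string are placeholders, not defaults taken from this diff):

    from jsonargparse import ArgumentParser
    from hyperion.torch.models.xvectors.efficient_net_xvector import EfficientNetXVector

    parser = ArgumentParser(prog="train_xvector")
    EfficientNetXVector.add_class_args(parser, prefix="model")  # options appear as --model.*

    # given a plain dict of parsed options, drop anything the model does not understand
    cfg = {"embed_dim": 256, "loss_type": "arc-softmax", "lr": 1e-3}  # placeholder values
    model_args = EfficientNetXVector.filter_args(**cfg)               # "lr" is filtered out
    model = EfficientNetXVector(
        "efficientnet-b0", in_feats=80, num_classes=1000, **model_args
    )
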
@staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") - # we put args of EfficientNet first so it get swish as + # we put args of EfficientNet first so it get swish as # default activation instead of relu - EN.add_class_args(parser) + EN.add_class_args(parser) XVector.add_class_args(parser) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='xvector options') - - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='xvector options') add_argparse_args = add_class_args diff --git a/hyperion/torch/models/xvectors/resnet_xvector.py b/hyperion/torch/models/xvectors/resnet_xvector.py index 029fb224..4893162d 100644 --- a/hyperion/torch/models/xvectors/resnet_xvector.py +++ b/hyperion/torch/models/xvectors/resnet_xvector.py @@ -14,44 +14,43 @@ class ResNetXVector(XVector): - def __init__(self, - resnet_type, - in_feats, - num_classes, - in_channels, - conv_channels=64, - base_channels=64, - in_kernel_size=7, - in_stride=1, - zero_init_residual=False, - groups=1, - replace_stride_with_dilation=None, - do_maxpool=False, - pool_net='mean+stddev', - embed_dim=256, - num_embed_layers=1, - hid_act={ - 'name': 'relu', - 'inplace': True - }, - loss_type='arc-softmax', - s=64, - margin=0.3, - margin_warmup_epochs=0, - num_subcenters=2, - dropout_rate=0, - norm_layer=None, - head_norm_layer=None, - use_norm=True, - norm_before=True, - in_norm=False, - embed_layer=0, - proj_feats=None, - se_r=16, - res2net_scale=4, - res2net_width_factor=1): - - logging.info('making %s encoder network', resnet_type) + def __init__( + self, + resnet_type, + in_feats, + num_classes, + in_channels, + conv_channels=64, + base_channels=64, + in_kernel_size=7, + in_stride=1, + zero_init_residual=False, + groups=1, + replace_stride_with_dilation=None, + do_maxpool=False, + pool_net="mean+stddev", + embed_dim=256, + num_embed_layers=1, + hid_act={"name": "relu", "inplace": True}, + loss_type="arc-softmax", + s=64, + margin=0.3, + margin_warmup_epochs=0, + num_subcenters=2, + dropout_rate=0, + norm_layer=None, + head_norm_layer=None, + use_norm=True, + norm_before=True, + in_norm=False, + embed_layer=0, + proj_feats=None, + se_r=16, + res2net_scale=4, + res2net_width_factor=1, + ): + + logging.info("making %s encoder network", resnet_type) encoder_net = RNF.create( resnet_type, in_channels, @@ -71,27 +70,30 @@ def __init__(self, se_r=se_r, in_feats=in_feats, res2net_scale=res2net_scale, - res2net_width_factor=res2net_width_factor) - - super().__init__(encoder_net, - num_classes, - pool_net=pool_net, - embed_dim=embed_dim, - num_embed_layers=num_embed_layers, - hid_act=hid_act, - loss_type=loss_type, - s=s, - margin=margin, - margin_warmup_epochs=margin_warmup_epochs, - num_subcenters=num_subcenters, - norm_layer=norm_layer, - head_norm_layer=head_norm_layer, - use_norm=use_norm, - norm_before=norm_before, - dropout_rate=dropout_rate, - embed_layer=embed_layer, - in_feats=in_feats, - proj_feats=proj_feats) + res2net_width_factor=res2net_width_factor, + ) + + super().__init__( + encoder_net, + num_classes, + pool_net=pool_net, + embed_dim=embed_dim, + num_embed_layers=num_embed_layers, + hid_act=hid_act, + loss_type=loss_type, + s=s, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + num_subcenters=num_subcenters, + norm_layer=norm_layer, + head_norm_layer=head_norm_layer, + use_norm=use_norm, + 
norm_before=norm_before, + dropout_rate=dropout_rate, + embed_layer=embed_layer, + in_feats=in_feats, + proj_feats=proj_feats, + ) self.resnet_type = resnet_type @@ -150,25 +152,25 @@ def res2net_width_factor(self): def get_config(self): base_config = super().get_config() - del base_config['encoder_cfg'] + del base_config["encoder_cfg"] pool_cfg = self.pool_net.get_config() config = { - 'resnet_type': self.resnet_type, - 'in_channels': self.in_channels, - 'conv_channels': self.conv_channels, - 'base_channels': self.base_channels, - 'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'zero_init_residual': self.zero_init_residual, - 'groups': self.groups, - 'replace_stride_with_dilation': self.replace_stride_with_dilation, - 'do_maxpool': self.do_maxpool, - 'in_norm': self.in_norm, - 'se_r': self.se_r, - 'res2net_scale': self.res2net_scale, - 'res2net_width_factor': self.res2net_width_factor + "resnet_type": self.resnet_type, + "in_channels": self.in_channels, + "conv_channels": self.conv_channels, + "base_channels": self.base_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "zero_init_residual": self.zero_init_residual, + "groups": self.groups, + "replace_stride_with_dilation": self.replace_stride_with_dilation, + "do_maxpool": self.do_maxpool, + "in_norm": self.in_norm, + "se_r": self.se_r, + "res2net_scale": self.res2net_scale, + "res2net_width_factor": self.res2net_width_factor, } config.update(base_config) @@ -197,14 +199,13 @@ def filter_args(**kwargs): def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") XVector.add_class_args(parser) RNF.add_class_args(parser) if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='xvector options') add_argparse_args = add_class_args diff --git a/hyperion/torch/models/xvectors/spinenet_xvector.py b/hyperion/torch/models/xvectors/spinenet_xvector.py index e9929bf0..406a13a4 100644 --- a/hyperion/torch/models/xvectors/spinenet_xvector.py +++ b/hyperion/torch/models/xvectors/spinenet_xvector.py @@ -14,91 +14,94 @@ class SpineNetXVector(XVector): - def __init__(self, - spinenet_type, - in_feats, - num_classes, - in_channels, - output_levels=[3, 4, 5, 6, 7], - endpoints_num_filters=256, - resample_alpha=0.5, - block_repeats=1, - filter_size_scale=1.0, - conv_channels=64, - base_channels=64, - in_kernel_size=7, - in_stride=1, - zero_init_residual=False, - groups=1, - do_maxpool=False, - pool_net='mean+stddev', - embed_dim=256, - num_embed_layers=1, - hid_act={ - 'name': 'relu', - 'inplace': True - }, - loss_type='arc-softmax', - s=64, - margin=0.3, - margin_warmup_epochs=0, - num_subcenters=2, - dropout_rate=0, - norm_layer=None, - head_norm_layer=None, - use_norm=True, - norm_before=True, - in_norm=False, - embed_layer=0, - proj_feats=None, - se_r=16, - res2net_scale=4, - res2net_width_factor=1): - - logging.info('making %s encoder network', spinenet_type) - encoder_net = SNF.create(spinenet_type, - in_channels, - output_levels=output_levels, - endpoints_num_filters=endpoints_num_filters, - resample_alpha=resample_alpha, - block_repeats=block_repeats, - filter_size_scale=filter_size_scale, - conv_channels=conv_channels, - base_channels=base_channels, - hid_act=hid_act, - in_kernel_size=in_kernel_size, - in_stride=in_stride, - zero_init_residual=zero_init_residual, - 
groups=groups, - dropout_rate=dropout_rate, - norm_layer=norm_layer, - norm_before=norm_before, - do_maxpool=do_maxpool, - in_norm=in_norm, - se_r=se_r, - in_feats=in_feats, - res2net_scale=res2net_scale, - res2net_width_factor=res2net_width_factor) - - super().__init__(encoder_net, - num_classes, - pool_net=pool_net, - embed_dim=embed_dim, - num_embed_layers=num_embed_layers, - hid_act=hid_act, - loss_type=loss_type, - s=s, - margin=margin, - margin_warmup_epochs=margin_warmup_epochs, - num_subcenters=num_subcenters, - norm_layer=norm_layer, - head_norm_layer=head_norm_layer, - use_norm=use_norm, - norm_before=norm_before, - dropout_rate=dropout_rate, - embed_layer=embed_layer, - in_feats=in_feats, - proj_feats=proj_feats) + def __init__( + self, + spinenet_type, + in_feats, + num_classes, + in_channels, + output_levels=[3, 4, 5, 6, 7], + endpoints_num_filters=256, + resample_alpha=0.5, + block_repeats=1, + filter_size_scale=1.0, + conv_channels=64, + base_channels=64, + in_kernel_size=7, + in_stride=1, + zero_init_residual=False, + groups=1, + do_maxpool=False, + pool_net="mean+stddev", + embed_dim=256, + num_embed_layers=1, + hid_act={"name": "relu", "inplace": True}, + loss_type="arc-softmax", + s=64, + margin=0.3, + margin_warmup_epochs=0, + num_subcenters=2, + dropout_rate=0, + norm_layer=None, + head_norm_layer=None, + use_norm=True, + norm_before=True, + in_norm=False, + embed_layer=0, + proj_feats=None, + se_r=16, + res2net_scale=4, + res2net_width_factor=1, + ): + + logging.info("making %s encoder network", spinenet_type) + encoder_net = SNF.create( + spinenet_type, + in_channels, + output_levels=output_levels, + endpoints_num_filters=endpoints_num_filters, + resample_alpha=resample_alpha, + block_repeats=block_repeats, + filter_size_scale=filter_size_scale, + conv_channels=conv_channels, + base_channels=base_channels, + hid_act=hid_act, + in_kernel_size=in_kernel_size, + in_stride=in_stride, + zero_init_residual=zero_init_residual, + groups=groups, + dropout_rate=dropout_rate, + norm_layer=norm_layer, + norm_before=norm_before, + do_maxpool=do_maxpool, + in_norm=in_norm, + se_r=se_r, + in_feats=in_feats, + res2net_scale=res2net_scale, + res2net_width_factor=res2net_width_factor, + ) + + super().__init__( + encoder_net, + num_classes, + pool_net=pool_net, + embed_dim=embed_dim, + num_embed_layers=num_embed_layers, + hid_act=hid_act, + loss_type=loss_type, + s=s, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + num_subcenters=num_subcenters, + norm_layer=norm_layer, + head_norm_layer=head_norm_layer, + use_norm=use_norm, + norm_before=norm_before, + dropout_rate=dropout_rate, + embed_layer=embed_layer, + in_feats=in_feats, + proj_feats=proj_feats, + ) self.spinenet_type = spinenet_type @@ -173,29 +176,29 @@ def res2net_width_factor(self): def get_config(self): base_config = super().get_config() - del base_config['encoder_cfg'] + del base_config["encoder_cfg"] pool_cfg = self.pool_net.get_config() config = { - 'spinenet_type': self.spinenet_type, - 'in_channels': self.in_channels, - 'output_levels': self.output_levels, - 'endpoints_num_filters': self.endpoints_num_filters, - 'resample_alpha': self.resample_alpha, - 'block_repeats': self.block_repeats, - 'filter_size_scale': self.filter_size_scale, - 'conv_channels': self.conv_channels, - 'base_channels': self.base_channels, - 'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'zero_init_residual': self.zero_init_residual, - 'groups': self.groups, - 'do_maxpool': self.do_maxpool, - 'in_norm': 
self.in_norm, - 'res2net_scale': self.res2net_scale, - 'res2net_width_factor': self.res2net_width_factor, - 'se_r': self.se_r + "spinenet_type": self.spinenet_type, + "in_channels": self.in_channels, + "output_levels": self.output_levels, + "endpoints_num_filters": self.endpoints_num_filters, + "resample_alpha": self.resample_alpha, + "block_repeats": self.block_repeats, + "filter_size_scale": self.filter_size_scale, + "conv_channels": self.conv_channels, + "base_channels": self.base_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "zero_init_residual": self.zero_init_residual, + "groups": self.groups, + "do_maxpool": self.do_maxpool, + "in_norm": self.in_norm, + "res2net_scale": self.res2net_scale, + "res2net_width_factor": self.res2net_width_factor, + "se_r": self.se_r, } config.update(base_config) @@ -224,13 +227,12 @@ def filter_args(**kwargs): def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") XVector.add_class_args(parser) SNF.add_class_args(parser) if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/torch/models/xvectors/tdnn_xvector.py b/hyperion/torch/models/xvectors/tdnn_xvector.py index 2d3c3cdf..ddd35f64 100644 --- a/hyperion/torch/models/xvectors/tdnn_xvector.py +++ b/hyperion/torch/models/xvectors/tdnn_xvector.py @@ -12,53 +12,84 @@ from .xvector import XVector from ...narchs import TDNNFactory as TF -class TDNNXVector(XVector): - def __init__(self, tdnn_type, num_enc_blocks, - in_feats, num_classes, - enc_hid_units, enc_expand_units=None, - kernel_size=3, dilation=1, dilation_factor=1, - pool_net='mean+stddev', - embed_dim=256, - num_embed_layers=1, - hid_act={'name':'relu6', 'inplace':True}, - loss_type='arc-softmax', - s=64, margin=0.3, margin_warmup_epochs=0, - num_subcenters=2, - dropout_rate=0, - norm_layer=None, head_norm_layer=None, - use_norm=True, norm_before=False, in_norm=False, - embed_layer=0, proj_feats=None): - - logging.info('making %s encoder network' % (tdnn_type)) +class TDNNXVector(XVector): + def __init__( + self, + tdnn_type, + num_enc_blocks, + in_feats, + num_classes, + enc_hid_units, + enc_expand_units=None, + kernel_size=3, + dilation=1, + dilation_factor=1, + pool_net="mean+stddev", + embed_dim=256, + num_embed_layers=1, + hid_act={"name": "relu6", "inplace": True}, + loss_type="arc-softmax", + s=64, + margin=0.3, + margin_warmup_epochs=0, + num_subcenters=2, + dropout_rate=0, + norm_layer=None, + head_norm_layer=None, + use_norm=True, + norm_before=False, + in_norm=False, + embed_layer=0, + proj_feats=None, + ): + + logging.info("making %s encoder network" % (tdnn_type)) encoder_net = TF.create( - tdnn_type, num_enc_blocks, - in_feats, enc_hid_units, enc_expand_units, - kernel_size=kernel_size, - dilation=dilation, dilation_factor=dilation_factor, - hid_act=hid_act, dropout_rate=dropout_rate, - norm_layer=norm_layer, use_norm=use_norm, - norm_before=norm_before, in_norm=in_norm) + tdnn_type, + num_enc_blocks, + in_feats, + enc_hid_units, + enc_expand_units, + kernel_size=kernel_size, + dilation=dilation, + dilation_factor=dilation_factor, + hid_act=hid_act, + dropout_rate=dropout_rate, + norm_layer=norm_layer, + use_norm=use_norm, + norm_before=norm_before, + in_norm=in_norm, + ) super().__init__( - encoder_net, num_classes, 
pool_net=pool_net, - embed_dim=embed_dim, num_embed_layers=num_embed_layers, - hid_act=hid_act, loss_type=loss_type, - s=s, margin=margin, margin_warmup_epochs=margin_warmup_epochs, + encoder_net, + num_classes, + pool_net=pool_net, + embed_dim=embed_dim, + num_embed_layers=num_embed_layers, + hid_act=hid_act, + loss_type=loss_type, + s=s, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, num_subcenters=num_subcenters, - norm_layer=norm_layer, head_norm_layer=head_norm_layer, - use_norm=use_norm, norm_before=norm_before, + norm_layer=norm_layer, + head_norm_layer=head_norm_layer, + use_norm=use_norm, + norm_before=norm_before, dropout_rate=dropout_rate, - embed_layer=embed_layer, - in_feats=None, proj_feats=proj_feats) + embed_layer=embed_layer, + in_feats=None, + proj_feats=proj_feats, + ) self.tdnn_type = tdnn_type - + @property def num_enc_blocks(self): return self.encoder_net.num_blocks - @property def enc_hid_units(self): return self.encoder_net.hid_units @@ -70,7 +101,6 @@ def enc_expand_units(self): except: return None - @property def kernel_size(self): return self.encoder_net.kernel_size @@ -87,40 +117,38 @@ def dilation_factor(self): def in_norm(self): return self.encoder_net.in_norm - def get_config(self): base_config = super().get_config() - del base_config['encoder_cfg'] + del base_config["encoder_cfg"] pool_cfg = self.pool_net.get_config() - config = {'tdnn_type': self.tdnn_type, - 'num_enc_blocks': self.num_enc_blocks, - 'in_feats': self.in_feats, - 'enc_hid_units': self.enc_hid_units, - 'enc_expand_units': self.enc_expand_units, - 'kernel_size': self.kernel_size, - 'dilation': self.dilation, - 'dilation_factor': self.dilation_factor, - 'in_norm': self.in_norm } + config = { + "tdnn_type": self.tdnn_type, + "num_enc_blocks": self.num_enc_blocks, + "in_feats": self.in_feats, + "enc_hid_units": self.enc_hid_units, + "enc_expand_units": self.enc_expand_units, + "kernel_size": self.kernel_size, + "dilation": self.dilation, + "dilation_factor": self.dilation_factor, + "in_norm": self.in_norm, + } config.update(base_config) return config - @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = cls._load_cfg_state_dict( - file_path, cfg, state_dict) + cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - model = cls(**cfg) + model = cls(**cfg) if state_dict is not None: model.load_state_dict(state_dict) return model - def filter_args(**kwargs): base_args = XVector.filter_args(**kwargs) @@ -129,21 +157,17 @@ def filter_args(**kwargs): base_args.update(child_args) return base_args - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') - + parser = ArgumentParser(prog="") + XVector.add_class_args(parser) TF.add_class_args(parser) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='xvector options') - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='xvector options') add_argparse_args = add_class_args diff --git a/hyperion/torch/models/xvectors/transformer_xvector_v1.py b/hyperion/torch/models/xvectors/transformer_xvector_v1.py index e381accb..0ce13fcf 100644 --- a/hyperion/torch/models/xvectors/transformer_xvector_v1.py +++ b/hyperion/torch/models/xvectors/transformer_xvector_v1.py @@ -42,7 +42,7 @@ class TransformerXVectorV1(XVector): dropout_rate: dropout rate for ff block and classification head pos_dropout_rate: dropout rate for 
positional encoder att_dropout_rate: dropout rate for attention block - + use_norm: if True use batch/layer norm norm_before: if True, use layer norm before layers, otherwise after @@ -50,80 +50,84 @@ class TransformerXVectorV1(XVector): embed_layer: which layer to use to extract x-vectors proj_feats: add linear projection layer after the encoder to project feature dimension to proj_feats """ - def __init__(self, - in_feats, - num_classes, - enc_d_model=512, - num_enc_heads=4, - num_enc_blocks=6, - enc_att_type='scaled-dot-prod-v1', - enc_att_context=25, - enc_ff_type='linear', - enc_d_ff=2048, - enc_ff_kernel_size=1, - in_layer_type='conv2d-sub', - enc_concat_after=False, - pool_net='mean+stddev', - embed_dim=256, - num_embed_layers=1, - hid_act={ - 'name': 'relu6', - 'inplace': True - }, - loss_type='arc-softmax', - s=64, - margin=0.3, - margin_warmup_epochs=0, - num_subcenters=2, - dropout_rate=0.1, - pos_dropout_rate=0.1, - att_dropout_rate=0.0, - norm_layer=None, - head_norm_layer=None, - use_norm=True, - norm_before=False, - in_norm=False, - embed_layer=0, - proj_feats=None): - - logging.info('making transformer-v1 encoder network') - encoder_net = TE(in_feats, - enc_d_model, - num_enc_heads, - num_enc_blocks, - att_type=enc_att_type, - att_context=enc_att_context, - ff_type=enc_ff_type, - d_ff=enc_d_ff, - ff_kernel_size=enc_ff_kernel_size, - ff_dropout_rate=dropout_rate, - pos_dropout_rate=pos_dropout_rate, - att_dropout_rate=att_dropout_rate, - in_layer_type=in_layer_type, - norm_before=norm_before, - concat_after=enc_concat_after, - in_time_dim=-1, - out_time_dim=-1) - - super().__init__(encoder_net, - num_classes, - pool_net=pool_net, - embed_dim=embed_dim, - num_embed_layers=num_embed_layers, - hid_act=hid_act, - loss_type=loss_type, - s=s, - margin=margin, - margin_warmup_epochs=margin_warmup_epochs, - num_subcenters=num_subcenters, - norm_layer=norm_layer, - head_norm_layer=head_norm_layer, - use_norm=use_norm, - norm_before=norm_before, - dropout_rate=dropout_rate, - embed_layer=embed_layer, - in_feats=None, - proj_feats=proj_feats) + + def __init__( + self, + in_feats, + num_classes, + enc_d_model=512, + num_enc_heads=4, + num_enc_blocks=6, + enc_att_type="scaled-dot-prod-v1", + enc_att_context=25, + enc_ff_type="linear", + enc_d_ff=2048, + enc_ff_kernel_size=1, + in_layer_type="conv2d-sub", + enc_concat_after=False, + pool_net="mean+stddev", + embed_dim=256, + num_embed_layers=1, + hid_act={"name": "relu6", "inplace": True}, + loss_type="arc-softmax", + s=64, + margin=0.3, + margin_warmup_epochs=0, + num_subcenters=2, + dropout_rate=0.1, + pos_dropout_rate=0.1, + att_dropout_rate=0.0, + norm_layer=None, + head_norm_layer=None, + use_norm=True, + norm_before=False, + in_norm=False, + embed_layer=0, + proj_feats=None, + ): + + logging.info("making transformer-v1 encoder network") + encoder_net = TE( + in_feats, + enc_d_model, + num_enc_heads, + num_enc_blocks, + att_type=enc_att_type, + att_context=enc_att_context, + ff_type=enc_ff_type, + d_ff=enc_d_ff, + ff_kernel_size=enc_ff_kernel_size, + ff_dropout_rate=dropout_rate, + pos_dropout_rate=pos_dropout_rate, + att_dropout_rate=att_dropout_rate, + in_layer_type=in_layer_type, + norm_before=norm_before, + concat_after=enc_concat_after, + in_time_dim=-1, + out_time_dim=-1, + ) + + super().__init__( + encoder_net, + num_classes, + pool_net=pool_net, + embed_dim=embed_dim, + num_embed_layers=num_embed_layers, + hid_act=hid_act, + loss_type=loss_type, + s=s, + margin=margin, + margin_warmup_epochs=margin_warmup_epochs, + 
num_subcenters=num_subcenters, + norm_layer=norm_layer, + head_norm_layer=head_norm_layer, + use_norm=use_norm, + norm_before=norm_before, + dropout_rate=dropout_rate, + embed_layer=embed_layer, + in_feats=None, + proj_feats=proj_feats, + ) @property def enc_d_model(self): @@ -182,29 +186,29 @@ def enc_ff_type(self): # return self.encoder_net.in_norm def get_config(self): - """ Gets network config + """Gets network config Returns: dictionary with config params """ base_config = super(TransformerXVectorV1, self).get_config() - del base_config['encoder_cfg'] + del base_config["encoder_cfg"] pool_cfg = self.pool_net.get_config() config = { - 'num_enc_blocks': self.num_enc_blocks, - 'in_feats': self.in_feats, - 'enc_d_model': self.enc_d_model, - 'num_enc_heads': self.num_enc_heads, - 'enc_att_type': self.enc_att_type, - 'enc_att_context': self.enc_att_context, - 'enc_ff_type': self.enc_ff_type, - 'enc_d_ff': self.enc_d_ff, - 'enc_ff_kernel_size': self.enc_ff_kernel_size, - 'pos_dropout_rate': self.pos_dropout_rate, - 'att_dropout_rate': self.att_dropout_rate, - 'in_layer_type': self.in_layer_type, - 'enc_concat_after': self.enc_concat_after + "num_enc_blocks": self.num_enc_blocks, + "in_feats": self.in_feats, + "enc_d_model": self.enc_d_model, + "num_enc_heads": self.num_enc_heads, + "enc_att_type": self.enc_att_type, + "enc_att_context": self.enc_att_context, + "enc_ff_type": self.enc_ff_type, + "enc_d_ff": self.enc_d_ff, + "enc_ff_kernel_size": self.enc_ff_kernel_size, + "pos_dropout_rate": self.pos_dropout_rate, + "att_dropout_rate": self.att_dropout_rate, + "in_layer_type": self.in_layer_type, + "enc_concat_after": self.enc_concat_after, } #'in_norm': self.in_norm } @@ -213,15 +217,13 @@ def get_config(self): @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - """Loads model from file - - """ + """Loads model from file""" cfg, state_dict = cls._load_cfg_state_dict(file_path, cfg, state_dict) - #fix to load old model - if 'd_enc_ff' in cfg: - cfg['enc_d_ff'] = cfg['d_enc_ff'] - del cfg['d_enc_ff'] + # fix to load old model + if "d_enc_ff" in cfg: + cfg["enc_d_ff"] = cfg["d_enc_ff"] + del cfg["d_enc_ff"] model = cls(**cfg) if state_dict is not None: model.load_state_dict(state_dict) @@ -230,7 +232,7 @@ def load(cls, file_path=None, cfg=None, state_dict=None): @staticmethod def filter_args(**kwargs): - """ Filters arguments correspondin to TransformerXVector + """Filters arguments correspondin to TransformerXVector from args dictionary Args: @@ -242,11 +244,21 @@ def filter_args(**kwargs): """ base_args = XVector.filter_args(**kwargs) - valid_args = ('num_enc_blocks', 'in_feats', 'enc_d_model', - 'num_enc_heads', 'enc_att_type', 'enc_att_context', - 'enc_ff_type', 'enc_d_ff', 'enc_ff_kernel_size', - 'pos_dropout_rate', 'att_dropout_rate', 'in_layer_type', - 'enc_concat_after') + valid_args = ( + "num_enc_blocks", + "in_feats", + "enc_d_model", + "num_enc_heads", + "enc_att_type", + "enc_att_context", + "enc_ff_type", + "enc_d_ff", + "enc_ff_kernel_size", + "pos_dropout_rate", + "att_dropout_rate", + "in_layer_type", + "enc_concat_after", + ) child_args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) base_args.update(child_args) @@ -255,84 +267,97 @@ def filter_args(**kwargs): @staticmethod def add_class_args(parser, prefix=None): """Adds TransformerXVector config parameters to argparser - + Args: parser: argparse object prefix: prefix string to add to the argument names """ if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser 
= ArgumentParser(prog="") XVector.add_class_args(parser) - parser.add_argument('--num-enc-blocks', - default=6, - type=int, - help=('number of tranformer blocks')) + parser.add_argument( + "--num-enc-blocks", + default=6, + type=int, + help=("number of tranformer blocks"), + ) - parser.add_argument('--enc-d-model', - default=512, - type=int, - help=('encoder layer sizes')) + parser.add_argument( + "--enc-d-model", default=512, type=int, help=("encoder layer sizes") + ) - parser.add_argument('--num-enc-heads', - default=4, - type=int, - help=('number of heads in self-attention layers')) + parser.add_argument( + "--num-enc-heads", + default=4, + type=int, + help=("number of heads in self-attention layers"), + ) parser.add_argument( - '--enc-att-type', - default='scaled-dot-prod-v1', - choices=['scaled-dot-prod-v1', 'local-scaled-dot-prod-v1'], - help=('type of self-attention')) + "--enc-att-type", + default="scaled-dot-prod-v1", + choices=["scaled-dot-prod-v1", "local-scaled-dot-prod-v1"], + help=("type of self-attention"), + ) - parser.add_argument('--enc-att-context', - default=25, - type=int, - help=('context size when using local attention')) + parser.add_argument( + "--enc-att-context", + default=25, + type=int, + help=("context size when using local attention"), + ) parser.add_argument( - '--enc-ff-type', - default='linear', - choices=['linear', 'conv1dx2', 'conv1dlinear'], - help=('type of feed forward layers in transformer block')) + "--enc-ff-type", + default="linear", + choices=["linear", "conv1dx2", "conv1dlinear"], + help=("type of feed forward layers in transformer block"), + ) - parser.add_argument('--enc-d-ff', - default=2048, - type=int, - help=('size middle layer in feed forward block')) + parser.add_argument( + "--enc-d-ff", + default=2048, + type=int, + help=("size middle layer in feed forward block"), + ) parser.add_argument( - '--enc-ff-kernel-size', + "--enc-ff-kernel-size", default=3, type=int, - help=('kernel size in convolutional feed forward block')) - - parser.add_argument('--pos-dropout-rate', - default=0.1, - type=float, - help='positional encoder dropout') - parser.add_argument('--att-dropout-rate', - default=0, - type=float, - help='self-att dropout') - - parser.add_argument('--in-layer-type', - default='linear', - choices=['linear', 'conv2d-sub'], - help=('type of input layer')) + help=("kernel size in convolutional feed forward block"), + ) + + parser.add_argument( + "--pos-dropout-rate", + default=0.1, + type=float, + help="positional encoder dropout", + ) + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + + parser.add_argument( + "--in-layer-type", + default="linear", + choices=["linear", "conv2d-sub"], + help=("type of input layer"), + ) parser.add_argument( - '--enc-concat-after', + "--enc-concat-after", default=False, - action='store_true', - help='concatenate attention input and output instead of adding') + action="store_true", + help="concatenate attention input and output instead of adding", + ) # parser.add_argument('--in-norm', default=False, action='store_true', # help='batch normalization at the input') if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='xvector options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/classif_head.py b/hyperion/torch/narchs/classif_head.py index bfd99c43..62a2db26 100644 --- 
a/hyperion/torch/narchs/classif_head.py +++ b/hyperion/torch/narchs/classif_head.py @@ -33,27 +33,27 @@ class ClassifHead(NetArch): use_norm: it True it uses layer/batch-normalization norm_before: if True, layer-norm is before the activation function """ - def __init__(self, - in_feats, - num_classes, - embed_dim=256, - num_embed_layers=1, - hid_act={ - 'name': 'relu', - 'inplace': True - }, - loss_type='arc-softmax', - s=64, - margin=0.3, - margin_warmup_epochs=0, - num_subcenters=2, - norm_layer=None, - use_norm=True, - norm_before=True, - dropout_rate=0): + + def __init__( + self, + in_feats, + num_classes, + embed_dim=256, + num_embed_layers=1, + hid_act={"name": "relu", "inplace": True}, + loss_type="arc-softmax", + s=64, + margin=0.3, + margin_warmup_epochs=0, + num_subcenters=2, + norm_layer=None, + use_norm=True, + norm_before=True, + dropout_rate=0, + ): super().__init__() - assert num_embed_layers >= 1, 'num_embed_layers (%d < 1)' % num_embed_layers + assert num_embed_layers >= 1, "num_embed_layers (%d < 1)" % num_embed_layers self.num_embed_layers = num_embed_layers self.in_feats = in_feats @@ -63,7 +63,7 @@ def __init__(self, if use_norm: norm_groups = None - if norm_layer == 'group-norm': + if norm_layer == "group-norm": norm_groups = min(embed_dim // 8, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) else: @@ -83,63 +83,68 @@ def __init__(self, fc_blocks = [] for i in range(num_embed_layers - 1): fc_blocks.append( - FCBlock(prev_feats, - embed_dim, - activation=hid_act, - dropout_rate=dropout_rate, - norm_layer=self._norm_layer, - use_norm=use_norm, - norm_before=norm_before)) + FCBlock( + prev_feats, + embed_dim, + activation=hid_act, + dropout_rate=dropout_rate, + norm_layer=self._norm_layer, + use_norm=use_norm, + norm_before=norm_before, + ) + ) prev_feats = embed_dim - if loss_type != 'softmax': + if loss_type != "softmax": act = None else: act = hid_act fc_blocks.append( - FCBlock(prev_feats, - embed_dim, - activation=act, - norm_layer=self._norm_layer, - use_norm=use_norm, - norm_before=norm_before)) + FCBlock( + prev_feats, + embed_dim, + activation=act, + norm_layer=self._norm_layer, + use_norm=use_norm, + norm_before=norm_before, + ) + ) self.fc_blocks = nn.ModuleList(fc_blocks) # output layer - if loss_type == 'softmax': + if loss_type == "softmax": self.output = Linear(embed_dim, num_classes) - elif loss_type == 'cos-softmax': + elif loss_type == "cos-softmax": self.output = CosLossOutput( embed_dim, num_classes, s=s, margin=margin, - margin_warmup_epochs=margin_warmup_epochs) - elif loss_type == 'arc-softmax': + margin_warmup_epochs=margin_warmup_epochs, + ) + elif loss_type == "arc-softmax": self.output = ArcLossOutput( embed_dim, num_classes, s=s, margin=margin, - margin_warmup_epochs=margin_warmup_epochs) - elif loss_type == 'subcenter-arc-softmax': + margin_warmup_epochs=margin_warmup_epochs, + ) + elif loss_type == "subcenter-arc-softmax": self.output = SubCenterArcLossOutput( embed_dim, num_classes, num_subcenters, s=s, margin=margin, - margin_warmup_epochs=margin_warmup_epochs) + margin_warmup_epochs=margin_warmup_epochs, + ) - def rebuild_output_layer(self, - num_classes, - loss_type, - s, - margin, - margin_warmup_epochs, - num_subcenters=2): + def rebuild_output_layer( + self, num_classes, loss_type, s, margin, margin_warmup_epochs, num_subcenters=2 + ): embed_dim = self.embed_dim self.num_classes = num_classes @@ -149,54 +154,57 @@ def rebuild_output_layer(self, self.margin_warmup_epochs = margin_warmup_epochs self.num_subcenters = 
num_subcenters - if loss_type == 'softmax': + if loss_type == "softmax": self.output = Linear(embed_dim, num_classes) - elif loss_type == 'cos-softmax': + elif loss_type == "cos-softmax": self.output = CosLossOutput( embed_dim, num_classes, s=s, margin=margin, - margin_warmup_epochs=margin_warmup_epochs) - elif loss_type == 'arc-softmax': + margin_warmup_epochs=margin_warmup_epochs, + ) + elif loss_type == "arc-softmax": self.output = ArcLossOutput( embed_dim, num_classes, s=s, margin=margin, - margin_warmup_epochs=margin_warmup_epochs) - elif loss_type == 'subcenter-arc-softmax': + margin_warmup_epochs=margin_warmup_epochs, + ) + elif loss_type == "subcenter-arc-softmax": self.output = SubCenterArcLossOutput( embed_dim, num_classes, num_subcenters, s=s, margin=margin, - margin_warmup_epochs=margin_warmup_epochs) + margin_warmup_epochs=margin_warmup_epochs, + ) def set_margin(self, margin): - if self.loss_type == 'softmax': + if self.loss_type == "softmax": return self.margin = margin self.output.margin = margin def set_margin_warmup_epochs(self, margin_warmup_epochs): - if self.loss_type == 'softmax': + if self.loss_type == "softmax": return self.margin_warmup_epochs = margin_warmup_epochs self.output.margin_warmup_epochs = margin_warmup_epochs def set_s(self, s): - if self.loss_type == 'softmax': + if self.loss_type == "softmax": return self.s = s self.output.s = s def update_margin(self, epoch): - if hasattr(self.output, 'update_margin'): + if hasattr(self.output, "update_margin"): self.output.update_margin(epoch) def freeze_layers(self, layer_list): @@ -213,7 +221,7 @@ def forward(self, x, y=None): for l in range(self.num_embed_layers): x = self.fc_blocks[l](x) - if self.loss_type == 'softmax': + if self.loss_type == "softmax": y = self.output(x) else: y = self.output(x, y) @@ -232,7 +240,7 @@ def forward_hid_feats(self, x, y=None, layers=None, return_output=False): if l in layers: h.append(x) - if self.loss_type == 'softmax': + if self.loss_type == "softmax": y = self.output(x) else: y = self.output(x, y) @@ -254,20 +262,20 @@ def get_config(self): hid_act = AF.get_config(self.fc_blocks[0].activation) config = { - 'in_feats': self.in_feats, - 'num_classes': self.num_classes, - 'embed_dim': self.embed_dim, - 'num_embed_layers': self.num_embed_layers, - 'hid_act': hid_act, - 'lost_type': self.lost_type, - 's': self.s, - 'margin': self.margin, - 'margin_warmup_epochs': self.margin_warmup_epochs, - 'num_subcenters': self.num_subcenters, - 'norm_layer': self.norm_layer, - 'use_norm': self.use_norm, - 'norm_before': self.norm_before, - 'dropout_rate': self.dropout_rate + "in_feats": self.in_feats, + "num_classes": self.num_classes, + "embed_dim": self.embed_dim, + "num_embed_layers": self.num_embed_layers, + "hid_act": hid_act, + "lost_type": self.lost_type, + "s": self.s, + "margin": self.margin, + "margin_warmup_epochs": self.margin_warmup_epochs, + "num_subcenters": self.num_subcenters, + "norm_layer": self.norm_layer, + "use_norm": self.use_norm, + "norm_before": self.norm_before, + "dropout_rate": self.dropout_rate, } base_config = super().get_config() @@ -276,18 +284,29 @@ def get_config(self): @staticmethod def filter_args(**kwargs): - if 'wo_norm' in kwargs: - kwargs['use_norm'] = not kwargs['wo_norm'] - del kwargs['wo_norm'] - - if 'norm_after' in kwargs: - kwargs['norm_before'] = not kwargs['norm_after'] - del kwargs['norm_after'] - - valid_args = ('num_classes', 'embed_dim', 'num_embed_layers', - 'hid_act', 'loss_type', 's', 'margin', - 'margin_warmup_epochs', 'num_subcenters', 
'use_norm', - 'norm_before', 'dropout_rate', 'norm_layer') + if "wo_norm" in kwargs: + kwargs["use_norm"] = not kwargs["wo_norm"] + del kwargs["wo_norm"] + + if "norm_after" in kwargs: + kwargs["norm_before"] = not kwargs["norm_after"] + del kwargs["norm_after"] + + valid_args = ( + "num_classes", + "embed_dim", + "num_embed_layers", + "hid_act", + "loss_type", + "s", + "margin", + "margin_warmup_epochs", + "num_subcenters", + "use_norm", + "norm_before", + "dropout_rate", + "norm_layer", + ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args @@ -295,92 +314,88 @@ def filter_args(**kwargs): def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") - parser.add_argument('--embed-dim', - default=256, - type=int, - help=('x-vector dimension')) + parser.add_argument( + "--embed-dim", default=256, type=int, help=("x-vector dimension") + ) - parser.add_argument('--num-embed-layers', - default=1, - type=int, - help=('number of layers in the classif head')) + parser.add_argument( + "--num-embed-layers", + default=1, + type=int, + help=("number of layers in the classif head"), + ) try: - parser.add_argument('--hid-act', - default='relu6', - help='hidden activation') + parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass parser.add_argument( - '--loss-type', - default='arc-softmax', - choices=[ - 'softmax', 'arc-softmax', 'cos-softmax', - 'subcenter-arc-softmax' - ], - help= - 'loss type: softmax, arc-softmax, cos-softmax, subcenter-arc-softmax' + "--loss-type", + default="arc-softmax", + choices=["softmax", "arc-softmax", "cos-softmax", "subcenter-arc-softmax"], + help="loss type: softmax, arc-softmax, cos-softmax, subcenter-arc-softmax", ) - parser.add_argument('--s', - default=64, - type=float, - help='scale for arcface') + parser.add_argument("--s", default=64, type=float, help="scale for arcface") - parser.add_argument('--margin', - default=0.3, - type=float, - help='margin for arcface, cosface,...') + parser.add_argument( + "--margin", default=0.3, type=float, help="margin for arcface, cosface,..." 
+ ) parser.add_argument( - '--margin-warmup-epochs', + "--margin-warmup-epochs", default=10, type=float, - help='number of epoch until we set the final margin') + help="number of epoch until we set the final margin", + ) - parser.add_argument('--num-subcenters', - default=2, - type=int, - help='number of subcenters in subcenter losses') + parser.add_argument( + "--num-subcenters", + default=2, + type=int, + help="number of subcenters in subcenter losses", + ) try: parser.add_argument( - '--norm-layer', + "--norm-layer", default=None, choices=[ - 'batch-norm', 'group-norm', 'instance-norm', - 'instance-norm-affine', 'layer-norm' + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", ], - help= - 'type of normalization layer for all components of x-vector network' + help="type of normalization layer for all components of x-vector network", ) except: pass - parser.add_argument('--wo-norm', - default=False, - action='store_true', - help='without batch normalization') + parser.add_argument( + "--wo-norm", + default=False, + action="store_true", + help="without batch normalization", + ) - parser.add_argument('--norm-after', - default=False, - action='store_true', - help='batch normalizaton after activation') + parser.add_argument( + "--norm-after", + default=False, + action="store_true", + help="batch normalizaton after activation", + ) try: - parser.add_argument('--dropout-rate', - default=0, - type=float, - help='dropout') + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") except: pass if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='classification head options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/conformer_encoder_v1.py b/hyperion/torch/narchs/conformer_encoder_v1.py index e8348055..69f9300c 100644 --- a/hyperion/torch/narchs/conformer_encoder_v1.py +++ b/hyperion/torch/narchs/conformer_encoder_v1.py @@ -15,13 +15,14 @@ from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler from .net_arch import NetArch + class ConformerEncoderV1(NetArch): """Conformer encoder introduced in https://arxiv.org/pdf/2005.08100.pdf - This includes some optional extra features + This includes some optional extra features not included in the original paper: - - Choose local-attention (attending only to close frames + - Choose local-attention (attending only to close frames instead of all the frames in the sequence) - Choose number of conv blocks in each conformer layer - Squeeze-Excitation after depthwise-conv @@ -54,12 +55,12 @@ class ConformerEncoderV1(NetArch): that query q_i only attents to key k_j when j<=i no_pos_enc: if True, it doesn't use positional encoder. hid_act: hidden activations in ff and input blocks - conv_norm_layer: norm layer constructor or str for conv block, + conv_norm_layer: norm layer constructor or str for conv block, if None it uses BatchNorm1d se_r: Squeeze-Excitation compression ratio, if None it doesn't use Squeeze-Excitation ff_macaron: if True, it uses macaron-net style ff layers, otherwise transformer style. 
- red_lnorms: it True, use redundant LNorm layers at the output of the conformer blocks as + red_lnorms: it True, use redundant LNorm layers at the output of the conformer blocks as in the paper concat_after: if True, if concats attention input and output and apply linear transform, i.e., y = x + linear(concat(x, att(x))) @@ -71,35 +72,57 @@ class ConformerEncoderV1(NetArch): red_lnorm: (deprecated) """ - def __init__(self, in_feats, d_model=256, num_heads=4, num_blocks=6, - att_type='scaled-dot-prod-v1', att_context=25, - conv_repeats=1, conv_kernel_sizes=31, conv_strides=1, - ff_type='linear', d_ff=2048, ff_kernel_size=1, - dropout_rate=0.1, pos_dropout_rate=0.1, att_dropout_rate=0.0, - in_layer_type='conv2d-sub', - pos_enc_type='rel', - causal_pos_enc=False, - hid_act='swish', - conv_norm_layer=None, se_r=None, - ff_macaron=True, red_lnorms=False, concat_after=False, - padding_idx=-1, in_time_dim=-1, out_time_dim=1, - rel_pos_enc=True, red_lnorm=False): + def __init__( + self, + in_feats, + d_model=256, + num_heads=4, + num_blocks=6, + att_type="scaled-dot-prod-v1", + att_context=25, + conv_repeats=1, + conv_kernel_sizes=31, + conv_strides=1, + ff_type="linear", + d_ff=2048, + ff_kernel_size=1, + dropout_rate=0.1, + pos_dropout_rate=0.1, + att_dropout_rate=0.0, + in_layer_type="conv2d-sub", + pos_enc_type="rel", + causal_pos_enc=False, + hid_act="swish", + conv_norm_layer=None, + se_r=None, + ff_macaron=True, + red_lnorms=False, + concat_after=False, + padding_idx=-1, + in_time_dim=-1, + out_time_dim=1, + rel_pos_enc=True, + red_lnorm=False, + ): super().__init__() self.in_feats = in_feats self.d_model = d_model self.num_heads = num_heads self.num_blocks = num_blocks - + self.att_type = att_type self.att_context = att_context self.conv_repeats = self._standarize_cblocks_param( - conv_repeats, num_blocks, 'conv_repeats') + conv_repeats, num_blocks, "conv_repeats" + ) self.conv_kernel_sizes = self._standarize_cblocks_param( - conv_kernel_sizes, num_blocks, 'conv_kernel_sizes') + conv_kernel_sizes, num_blocks, "conv_kernel_sizes" + ) self.conv_strides = self._standarize_cblocks_param( - conv_strides, num_blocks, 'conv_strides') + conv_strides, num_blocks, "conv_strides" + ) self.ff_type = ff_type self.d_ff = d_ff @@ -121,31 +144,43 @@ def __init__(self, in_feats, d_model=256, num_heads=4, num_blocks=6, self.conv_norm_layer = conv_norm_layer norm_groups = None - if conv_norm_layer == 'group-norm': - norm_groups = min(d_model//2, 32) + if conv_norm_layer == "group-norm": + norm_groups = min(d_model // 2, 32) self._conv_norm_layer = NLF.create(conv_norm_layer, norm_groups) self._make_in_layer() blocks = [] for i in range(num_blocks): - blocks.append(EBlock( - d_model, att_type, num_heads, - self.conv_repeats[i], - self.conv_kernel_sizes[i], self.conv_strides[i], - ff_type, d_ff, ff_kernel_size, - hid_act=hid_act, dropout_rate=dropout_rate, - att_context=att_context, att_dropout_rate=att_dropout_rate, - pos_enc_type=pos_enc_type, causal_pos_enc=causal_pos_enc, - conv_norm_layer=self._conv_norm_layer, se_r = se_r, - ff_macaron=ff_macaron, out_lnorm=self.red_lnorms, - concat_after=concat_after)) + blocks.append( + EBlock( + d_model, + att_type, + num_heads, + self.conv_repeats[i], + self.conv_kernel_sizes[i], + self.conv_strides[i], + ff_type, + d_ff, + ff_kernel_size, + hid_act=hid_act, + dropout_rate=dropout_rate, + att_context=att_context, + att_dropout_rate=att_dropout_rate, + pos_enc_type=pos_enc_type, + causal_pos_enc=causal_pos_enc, + conv_norm_layer=self._conv_norm_layer, + se_r=se_r, + 
ff_macaron=ff_macaron, + out_lnorm=self.red_lnorms, + concat_after=concat_after, + ) + ) self.blocks = nn.ModuleList(blocks) if not self.red_lnorms: self.norm_out = nn.LayerNorm(d_model) - @staticmethod def _standarize_cblocks_param(p, num_blocks, p_name): if isinstance(p, int): @@ -153,28 +188,30 @@ def _standarize_cblocks_param(p, num_blocks, p_name): elif isinstance(p, list): if len(p) == 1: p = p * num_blocks - - assert len(p) == num_blocks, ( - 'len(%s)(%d)!=%d' % (p_name, len(p), num_blocks)) + + assert len(p) == num_blocks, "len(%s)(%d)!=%d" % ( + p_name, + len(p), + num_blocks, + ) else: - raise TypeError('wrong type for param {}={}'.format(p_name, p)) + raise TypeError("wrong type for param {}={}".format(p_name, p)) return p - def _make_in_layer(self): in_feats = self.in_feats d_model = self.d_model dropout_rate = self.dropout_rate - if self.pos_enc_type == 'no': + if self.pos_enc_type == "no": pos_enc = NoPosEncoder() - elif self.pos_enc_type == 'rel': + elif self.pos_enc_type == "rel": pos_enc = RelPosEncoder(d_model, self.pos_dropout_rate) - elif self.pos_enc_type == 'abs': + elif self.pos_enc_type == "abs": pos_enc = PosEncoder(d_model, self.pos_dropout_rate) else: - raise Exception('wrong pos-enc-type={}'.format(self.pos_enc_type)) + raise Exception("wrong pos-enc-type={}".format(self.pos_enc_type)) hid_act = AF.create(self.hid_act) @@ -184,24 +221,23 @@ def _make_in_layer(self): nn.LayerNorm(d_model), nn.Dropout(dropout_rate), hid_act, - pos_enc) + pos_enc, + ) elif self.in_layer_type == "conv2d-sub": self.in_layer = Conv2dSubsampler( - in_feats, d_model, hid_act, pos_enc, time_dim=self.in_time_dim) + in_feats, d_model, hid_act, pos_enc, time_dim=self.in_time_dim + ) elif self.in_layer_type == "embed": self.in_layer = nn.Sequential( - nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), - pos_enc) + nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc + ) elif isinstance(self.in_layer_type, nn.Module): - self.in_layer = nn.Sequential( - in_layer_type, - pos_enc) + self.in_layer = nn.Sequential(in_layer_type, pos_enc) elif self.in_layer_type is None: self.in_layer = pos_enc else: raise ValueError("unknown in_layer_type: " + self.in_layer_type) - def forward(self, x, mask=None, target_shape=None): """Forward pass function @@ -222,7 +258,7 @@ def forward(self, x, mask=None, target_shape=None): if isinstance(x, tuple): x, pos_emb = x - b_args = {'pos_emb': pos_emb} + b_args = {"pos_emb": pos_emb} else: b_args = {} @@ -240,50 +276,50 @@ def forward(self, x, mask=None, target_shape=None): return x, mask - def get_config(self): - """ Gets network config + """Gets network config Returns: dictionary with config params """ - config = {'in_feats': self.in_feats, - 'd_model': self.d_model, - 'num_heads': self.num_heads, - 'num_blocks': self.num_blocks, - 'att_type': self.att_type, - 'att_context': self.att_context, - 'conv_repeats': self.conv_repeats, - 'conv_kernel_sizes': self.conv_kernel_sizes, - 'conv_strides': self.conv_strides, - 'ff_type': self.ff_type, - 'd_ff': self.d_ff, - 'ff_kernel_size': self.ff_kernel_size, - 'dropout_rate': self.dropout_rate, - 'att_dropout_rate': self.att_dropout_rate, - 'pos_dropout_rate': self.pos_dropout_rate, - 'in_layer_type': self.in_layer_type, - 'pos_enc_type': self.pos_enc_type, - 'causal_pos_enc': self.causal_pos_enc, - 'hid_act': self.hid_act, - 'se_r': self.se_r, - 'ff_macaron': self.ff_macaron, - 'red_lnorms': self.red_lnorms, - 'conv_norm_layer': self.conv_norm_layer, - 'concat_after': self.concat_after, - 
'padding_idx': self.padding_idx, - 'in_time_dim': self.in_time_dim, - 'out_time_dim': self.out_time_dim } - + config = { + "in_feats": self.in_feats, + "d_model": self.d_model, + "num_heads": self.num_heads, + "num_blocks": self.num_blocks, + "att_type": self.att_type, + "att_context": self.att_context, + "conv_repeats": self.conv_repeats, + "conv_kernel_sizes": self.conv_kernel_sizes, + "conv_strides": self.conv_strides, + "ff_type": self.ff_type, + "d_ff": self.d_ff, + "ff_kernel_size": self.ff_kernel_size, + "dropout_rate": self.dropout_rate, + "att_dropout_rate": self.att_dropout_rate, + "pos_dropout_rate": self.pos_dropout_rate, + "in_layer_type": self.in_layer_type, + "pos_enc_type": self.pos_enc_type, + "causal_pos_enc": self.causal_pos_enc, + "hid_act": self.hid_act, + "se_r": self.se_r, + "ff_macaron": self.ff_macaron, + "red_lnorms": self.red_lnorms, + "conv_norm_layer": self.conv_norm_layer, + "concat_after": self.concat_after, + "padding_idx": self.padding_idx, + "in_time_dim": self.in_time_dim, + "out_time_dim": self.out_time_dim, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def in_context(self): return (self.att_context, self.att_context) def in_shape(self): """Input shape for network - + Returns: Tuple describing input shape """ @@ -292,7 +328,6 @@ def in_shape(self): else: return (None, self.in_feats, None) - def out_shape(self, in_shape=None): """Infers the network output shape given the input shape @@ -313,8 +348,8 @@ def out_shape(self, in_shape=None): out_t = None else: if isinstance(self.in_layer, Conv2dSubsampler): - #out_t = in_t//4 - out_t = ((in_t - 1)//2 - 1)//2 + # out_t = in_t//4 + out_t = ((in_t - 1) // 2 - 1) // 2 else: out_t = in_t @@ -323,11 +358,9 @@ def out_shape(self, in_shape=None): else: return (batch_size, self.d_model, out_t) - - @staticmethod - def filter_args( **kwargs): - """ Filters arguments correspondin to TransformerXVector + def filter_args(**kwargs): + """Filters arguments correspondin to TransformerXVector from args dictionary Args: @@ -337,157 +370,218 @@ def filter_args( **kwargs): args dictionary """ - if 'no_ff_macaron' in kwargs: - kwargs['ff_macaron'] = not kwargs['no_ff_macaron'] - - valid_args = ('num_blocks', - 'in_feats', - 'd_model', - 'num_heads', - 'att_type', - 'att_context', - 'conv_repeats', - 'conv_kernel_sizes', - 'conv_strides', - 'ff_type', - 'd_ff', - 'ff_kernel_size', - 'dropout_rate', - 'pos_dropout_rate', - 'att_dropout_rate', - 'in_layer_type', - 'hid_act', - 'pos_enc_type', - 'causal_pos_enc', - 'conv_norm_layer', - 'se_r', - 'ff_macaron', - 'red_lnorms', - 'concat_after') - - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - - + if "no_ff_macaron" in kwargs: + kwargs["ff_macaron"] = not kwargs["no_ff_macaron"] + + valid_args = ( + "num_blocks", + "in_feats", + "d_model", + "num_heads", + "att_type", + "att_context", + "conv_repeats", + "conv_kernel_sizes", + "conv_strides", + "ff_type", + "d_ff", + "ff_kernel_size", + "dropout_rate", + "pos_dropout_rate", + "att_dropout_rate", + "in_layer_type", + "hid_act", + "pos_enc_type", + "causal_pos_enc", + "conv_norm_layer", + "se_r", + "ff_macaron", + "red_lnorms", + "concat_after", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None, in_feats=False): """Adds Conformer config parameters to argparser - + Args: parser: argparse object prefix: prefix string to add to the argument names """ if prefix is not None: outer_parser = 
parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") if in_feats: parser.add_argument( - '--in-feats', type=int, default=80, - help=('input feature dimension')) + "--in-feats", type=int, default=80, help=("input feature dimension") + ) + parser.add_argument( + "--num-blocks", default=6, type=int, help=("number of tranformer blocks") + ) - parser.add_argument('--num-blocks', - default=6, type=int, - help=('number of tranformer blocks')) + parser.add_argument( + "--d-model", default=512, type=int, help=("encoder layer sizes") + ) - parser.add_argument('--d-model', - default=512, type=int, - help=('encoder layer sizes')) + parser.add_argument( + "--num-heads", + default=4, + type=int, + help=("number of heads in self-attention layers"), + ) - parser.add_argument('--num-heads', - default=4, type=int, - help=('number of heads in self-attention layers')) + parser.add_argument( + "--att-type", + default="scaled-dot-prod-v1", + choices=["scaled-dot-prod-v1", "local-scaled-dot-prod-v1"], + help=("type of self-attention"), + ) - parser.add_argument('--att-type', - default='scaled-dot-prod-v1', - choices=['scaled-dot-prod-v1', 'local-scaled-dot-prod-v1'], - help=('type of self-attention')) + parser.add_argument( + "--att-context", + default=25, + type=int, + help=("context size when using local attention"), + ) - parser.add_argument('--att-context', - default=25, type=int, - help=('context size when using local attention')) + parser.add_argument( + "--conv-repeats", + default=[1], + type=int, + nargs="+", + help=("number of conv blocks in each conformer block"), + ) parser.add_argument( - '--conv-repeats', default=[1], type=int, - nargs='+', help=('number of conv blocks in each conformer block')) + "--conv-kernel-sizes", + default=[31], + nargs="+", + type=int, + help=("kernels sizes for the depth-wise convs of each conformer block"), + ) parser.add_argument( - '--conv-kernel-sizes', default=[31], - nargs='+', type=int, - help=('kernels sizes for the depth-wise convs of each conformer block')) + "--conv-strides", + default=[1], + nargs="+", + type=int, + help=("resb-blocks strides for each encoder stage"), + ) parser.add_argument( - '--conv-strides', default=[1], - nargs='+', type=int, help=('resb-blocks strides for each encoder stage')) + "--ff-type", + default="linear", + choices=["linear", "conv1dx2", "conv1dlinear"], + help=("type of feed forward layers in transformer block"), + ) - parser.add_argument('--ff-type', - default='linear', choices=['linear', 'conv1dx2', 'conv1dlinear'], - help=('type of feed forward layers in transformer block')) - - parser.add_argument('--d-ff', - default=2048, type=int, - help=('size middle layer in feed forward block')) + parser.add_argument( + "--d-ff", + default=2048, + type=int, + help=("size middle layer in feed forward block"), + ) - parser.add_argument('--ff-kernel-size', - default=3, type=int, - help=('kernel size in convolutional feed forward block')) + parser.add_argument( + "--ff-kernel-size", + default=3, + type=int, + help=("kernel size in convolutional feed forward block"), + ) try: - parser.add_argument('--hid-act', default='swish', - help='hidden activation') + parser.add_argument("--hid-act", default="swish", help="hidden activation") except: pass - parser.add_argument('--pos-dropout-rate', default=0.1, type=float, - help='positional encoder dropout') - parser.add_argument('--att-dropout-rate', default=0, type=float, - help='self-att dropout') - parser.add_argument('--dropout-rate', default=0.1, type=float, - 
help='feed-forward layer dropout') - - parser.add_argument('--in-layer-type', - default='linear', choices=['linear', 'conv2d-sub'], - help=('type of input layer')) + parser.add_argument( + "--pos-dropout-rate", + default=0.1, + type=float, + help="positional encoder dropout", + ) + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + parser.add_argument( + "--dropout-rate", default=0.1, type=float, help="feed-forward layer dropout" + ) + + parser.add_argument( + "--in-layer-type", + default="linear", + choices=["linear", "conv2d-sub"], + help=("type of input layer"), + ) # parser.add_argument('--abs-pos-enc', default=False, action='store_true', # help='use absolute positional encoder') - parser.add_argument('--pos-enc-type', - default='rel', choices=['no', 'rel', 'abs'], - help=('type of positional encoder')) + parser.add_argument( + "--pos-enc-type", + default="rel", + choices=["no", "rel", "abs"], + help=("type of positional encoder"), + ) - parser.add_argument('--causal-pos-enc', default=False, action='store_true', - help='relative positional encodings are zero when attending to the future') + parser.add_argument( + "--causal-pos-enc", + default=False, + action="store_true", + help="relative positional encodings are zero when attending to the future", + ) try: parser.add_argument( - '--conv-norm-layer', default=None, - choices=['batch-norm', 'group-norm', 'instance-norm', 'instance-norm-affine', 'layer-norm'], - help='type of normalization layer for conv block in conformer') + "--conv-norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer for conv block in conformer", + ) except: pass parser.add_argument( - '--se-r', default=None, type=int, - help=('squeeze-excitation compression ratio')) + "--se-r", + default=None, + type=int, + help=("squeeze-excitation compression ratio"), + ) - parser.add_argument('--no-ff-macaron', default=False, action='store_true', - help='do not use macaron style ff layers ') + parser.add_argument( + "--no-ff-macaron", + default=False, + action="store_true", + help="do not use macaron style ff layers ", + ) - parser.add_argument('--red-lnorms', default=False, action='store_true', - help='use redundant Lnorm at conformer blocks\' outputs') + parser.add_argument( + "--red-lnorms", + default=False, + action="store_true", + help="use redundant Lnorm at conformer blocks' outputs", + ) - parser.add_argument('--concat-after', default=False, action='store_true', - help='concatenate attention input and output instead of adding') + parser.add_argument( + "--concat-after", + default=False, + action="store_true", + help="concatenate attention input and output instead of adding", + ) # parser.add_argument('--in-norm', default=False, action='store_true', # help='batch normalization at the input') if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='conformer encoder options') - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='conformer encoder options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/dc1d_decoder.py b/hyperion/torch/narchs/dc1d_decoder.py index d43d9923..c35d7720 100644 --- a/hyperion/torch/narchs/dc1d_decoder.py +++ b/hyperion/torch/narchs/dc1d_decoder.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math +import math from jsonargparse 
import ArgumentParser, ActionParser import torch @@ -17,23 +17,25 @@ class DC1dDecoder(NetArch): - - def __init__(self, in_channels=32, - in_conv_channels=32, - in_kernel_size=3, - in_stride=1, - conv_repeats=[1, 1, 1], - conv_channels=[64, 128, 128], - conv_kernel_sizes=3, - conv_strides=2, - conv_dilations=1, - head_channels=0, - hid_act='relu6', - head_act=None, - dropout_rate=0, - use_norm=True, - norm_layer=None, - norm_before=True): + def __init__( + self, + in_channels=32, + in_conv_channels=32, + in_kernel_size=3, + in_stride=1, + conv_repeats=[1, 1, 1], + conv_channels=[64, 128, 128], + conv_kernel_sizes=3, + conv_strides=2, + conv_dilations=1, + head_channels=0, + hid_act="relu6", + head_act=None, + dropout_rate=0, + use_norm=True, + norm_layer=None, + norm_before=True, + ): super().__init__() self.in_channels = in_channels @@ -43,13 +45,17 @@ def __init__(self, in_channels=32, num_superblocks = len(conv_repeats) self.conv_repeats = conv_repeats self.conv_channels = self._standarize_convblocks_param( - conv_channels, num_superblocks, 'conv_channels') + conv_channels, num_superblocks, "conv_channels" + ) self.conv_kernel_sizes = self._standarize_convblocks_param( - conv_kernel_sizes, num_superblocks, 'conv_kernel_sizes') + conv_kernel_sizes, num_superblocks, "conv_kernel_sizes" + ) self.conv_strides = self._standarize_convblocks_param( - conv_strides, num_superblocks, 'conv_strides') + conv_strides, num_superblocks, "conv_strides" + ) self.conv_dilations = self._standarize_convblocks_param( - conv_dilations, num_superblocks, 'conv_dilations') + conv_dilations, num_superblocks, "conv_dilations" + ) self.head_channels = head_channels self.hid_act = hid_act self.head_act = head_act @@ -59,23 +65,28 @@ def __init__(self, in_channels=32, self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': - norm_groups = min(np.min(self.conv_channels)//2, 32) + if norm_layer == "group-norm": + norm_groups = min(np.min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block self.in_block = DC1dDecBlock( - in_channels, in_conv_channels, in_kernel_size, - stride=in_stride, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) + in_channels, + in_conv_channels, + in_kernel_size, + stride=in_stride, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) self._context = self.in_block.context self._upsample_factor = self.in_block.stride cur_in_channels = in_conv_channels - - #middle blocks + + # middle blocks self.blocks = nn.ModuleList([]) for i in range(num_superblocks): repeats_i = self.conv_repeats[i] @@ -84,50 +95,69 @@ def __init__(self, in_channels=32, kernel_size_i = self.conv_kernel_sizes[i] dilation_i = self.conv_dilations[i] block_i = DC1dDecBlock( - cur_in_channels, channels_i, kernel_size_i, - stride=stride_i, dilation=1, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) - + cur_in_channels, + channels_i, + kernel_size_i, + stride=stride_i, + dilation=1, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) + self.blocks.append(block_i) self._context += block_i.context * self._upsample_factor self._upsample_factor *= block_i.stride - for j in range(repeats_i-1): + for j in range(repeats_i - 1): block_i = DC1dDecBlock( - 
channels_i, channels_i, kernel_size_i, - stride=1, dilation=dilation_i, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) - + channels_i, + channels_i, + kernel_size_i, + stride=1, + dilation=dilation_i, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) + self.blocks.append(block_i) self._context += block_i.context * self._upsample_factor cur_in_channels = channels_i - #head feature block + # head feature block if self.head_channels > 0: self.head_block = DC1dDecBlock( - cur_in_channels, head_channels, kernel_size=1, - stride=1, activation=head_act, - use_norm=False, norm_before=norm_before) + cur_in_channels, + head_channels, + kernel_size=1, + stride=1, + activation=head_act, + use_norm=False, + norm_before=norm_before, + ) self._init_weights(hid_act) - - def _init_weights(self, hid_act): if isinstance(hid_act, str): act_name = hid_act if isinstance(hid_act, dict): - act_name = hid_act['name'] - if act_name in ['relu6', 'swish']: - act_name = 'relu' + act_name = hid_act["name"] + if act_name in ["relu6", "swish"]: + act_name = "relu" - init_f1 = lambda x: nn.init.kaiming_normal_(x, mode='fan_out', nonlinearity=act_name) - init_f2 = lambda x: nn.init.kaiming_normal_(x, mode='fan_out', nonlinearity='relu') + init_f1 = lambda x: nn.init.kaiming_normal_( + x, mode="fan_out", nonlinearity=act_name + ) + init_f2 = lambda x: nn.init.kaiming_normal_( + x, mode="fan_out", nonlinearity="relu" + ) for m in self.modules(): if isinstance(m, nn.Conv1d): @@ -139,15 +169,13 @@ def _init_weights(self, hid_act): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) - #re-init subpixelconvs + # re-init subpixelconvs for m in self.modules(): if isinstance(m, SubPixelConv1d): try: ICNR1d(m.conv.weight, stride=m.stride, initializer=init_f1) except: ICNR1d(m.conv.weight, stride=m.stride, initializer=init_f2) - - @staticmethod def _standarize_convblocks_param(p, num_blocks, p_name): @@ -156,36 +184,37 @@ def _standarize_convblocks_param(p, num_blocks, p_name): elif isinstance(p, list): if len(p) == 1: p = p * num_blocks - - assert len(p) == num_blocks, ( - 'len(%s)(%d)!=%d' % (p_name, len(p), num_blocks)) + + assert len(p) == num_blocks, "len(%s)(%d)!=%d" % ( + p_name, + len(p), + num_blocks, + ) else: - raise TypeError('wrong type for param {}={}'.format(p_name, p)) + raise TypeError("wrong type for param {}={}".format(p_name, p)) return p - def _compute_out_size(self, in_size): out_size = in_size * in_stride for stride in self.conv_strides: - out_size *= stride + out_size *= stride return out_size - def in_context(self): - in_context = int(math.ceil(self._context/self._upsample_factor)) + in_context = int(math.ceil(self._context / self._upsample_factor)) return (in_context, in_context) - def in_shape(self): return (None, self.in_channels, None) - def out_shape(self, in_shape=None): - out_channels = self.head_channels if self.head_channels>0 else self.conv_channels[-1] + out_channels = ( + self.head_channels if self.head_channels > 0 else self.conv_channels[-1] + ) if in_shape is None: return (None, out_channels, None) @@ -197,20 +226,16 @@ def out_shape(self, in_shape=None): return (in_shape[0], out_channels, T) - - def _match_shape(self, x, target_shape): t = x.size(-1) target_t = target_shape[-1] surplus = t - target_t assert surplus >= 0 if surplus > 0: - x = torch.narrow(x, -1, surplus//2, target_t).contiguous() + x = torch.narrow(x, -1, 
surplus // 2, target_t).contiguous() return x - - def forward(self, x, target_shape=None): x = self.in_block(x) @@ -225,142 +250,190 @@ def forward(self, x, target_shape=None): return x - def get_config(self): - + head_act = self.head_act hid_act = self.hid_act - config = {'in_channels': self.in_channels, - 'in_conv_channels': self.in_conv_channels, - 'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'conv_repeats': self.conv_repeats, - 'conv_channels': self.conv_channels, - 'conv_kernel_sizes': self.conv_kernel_sizes, - 'conv_strides': self.conv_strides, - 'conv_dilations': self.conv_dilations, - 'head_channels': self.head_channels, - 'dropout_rate': self.dropout_rate, - 'hid_act': hid_act, - 'head_act': head_act, - 'use_norm': self.use_norm, - 'norm_layer': self.norm_layer, - 'norm_before': self.norm_before, - } - + config = { + "in_channels": self.in_channels, + "in_conv_channels": self.in_conv_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "conv_repeats": self.conv_repeats, + "conv_channels": self.conv_channels, + "conv_kernel_sizes": self.conv_kernel_sizes, + "conv_strides": self.conv_strides, + "conv_dilations": self.conv_dilations, + "head_channels": self.head_channels, + "dropout_rate": self.dropout_rate, + "hid_act": hid_act, + "head_act": head_act, + "use_norm": self.use_norm, + "norm_layer": self.norm_layer, + "norm_before": self.norm_before, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - @staticmethod def filter_args(**kwargs): - if 'wo_norm' in kwargs: - kwargs['use_norm'] = not kwargs['wo_norm'] - del kwargs['wo_norm'] - - if 'norm_after' in kwargs: - kwargs['norm_before'] = not kwargs['norm_after'] - del kwargs['norm_after'] - - - valid_args = ('in_channels', - 'in_conv_channels', 'in_kernel_size', 'in_stride', - 'conv_repeats', 'conv_channels', 'conv_kernel_sizes', - 'conv_strides', 'conv_dilations', - 'head_channels', - 'hid_act', 'had_act', - 'dropout_rate', - 'use_norm', 'norm_layer', 'norm_before') - - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + if "wo_norm" in kwargs: + kwargs["use_norm"] = not kwargs["wo_norm"] + del kwargs["wo_norm"] + + if "norm_after" in kwargs: + kwargs["norm_before"] = not kwargs["norm_after"] + del kwargs["norm_after"] + + valid_args = ( + "in_channels", + "in_conv_channels", + "in_kernel_size", + "in_stride", + "conv_repeats", + "conv_channels", + "conv_kernel_sizes", + "conv_strides", + "conv_dilations", + "head_channels", + "hid_act", + "had_act", + "dropout_rate", + "use_norm", + "norm_layer", + "norm_before", + ) + + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args - - @staticmethod def add_class_args(parser, prefix=None, head_channels=False): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--in-channels', type=int, default=80, - help=('input channels of decoder')) + "--in-channels", type=int, default=80, help=("input channels of decoder") + ) parser.add_argument( - '--in-conv-channels', default=128, type=int, - help=('number of output channels in input convolution')) + "--in-conv-channels", + default=128, + type=int, + help=("number of output channels in input convolution"), + ) parser.add_argument( - '--in-kernel-size', default=3, type=int, - help=('kernel size of input convolution')) + "--in-kernel-size", + default=3, + type=int, + help=("kernel size of input convolution"), + ) - 
parser.add_argument('--in-stride', default=1, type=int, - help=('stride of input convolution')) + parser.add_argument( + "--in-stride", default=1, type=int, help=("stride of input convolution") + ) parser.add_argument( - '--conv-repeats', default=[1, 1, 1], type=int, - nargs='+', help=('conv-blocks repeats in each decoder stage')) + "--conv-repeats", + default=[1, 1, 1], + type=int, + nargs="+", + help=("conv-blocks repeats in each decoder stage"), + ) parser.add_argument( - '--conv-channels', default=[64, 128, 128], - type=int, nargs='+', - help=('conv-blocks channels for each decoder stage')) + "--conv-channels", + default=[64, 128, 128], + type=int, + nargs="+", + help=("conv-blocks channels for each decoder stage"), + ) parser.add_argument( - '--conv-kernel-sizes', default=[3], - nargs='+', type=int, help=('conv-blocks kernels for each decoder stage')) + "--conv-kernel-sizes", + default=[3], + nargs="+", + type=int, + help=("conv-blocks kernels for each decoder stage"), + ) parser.add_argument( - '--conv-strides', default=[2], - nargs='+', type=int, help=('conv-blocks strides for each decoder stage')) + "--conv-strides", + default=[2], + nargs="+", + type=int, + help=("conv-blocks strides for each decoder stage"), + ) parser.add_argument( - '--conv-dilations', default=[1], - nargs='+', type=int, help=('conv-blocks dilations for each decoder stage')) + "--conv-dilations", + default=[1], + nargs="+", + type=int, + help=("conv-blocks dilations for each decoder stage"), + ) if head_channels: parser.add_argument( - '--head-channels', type=int, required=True, - help=('channels in the last conv block of decoder')) + "--head-channels", + type=int, + required=True, + help=("channels in the last conv block of decoder"), + ) try: - parser.add_argument('--hid-act', default='relu6', - help='hidden activation') + parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass - - parser.add_argument('--head-act', default=None, - help='activation in encoder head') - + + parser.add_argument( + "--head-act", default=None, help="activation in encoder head" + ) + try: - parser.add_argument('--dropout-rate', default=0, type=float, - help='dropout probability') + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) except: pass try: parser.add_argument( - '--norm-layer', default=None, - choices=['batch-norm', 'group-norm', 'instance-norm', 'instance-norm-affine', 'layer-norm'], - help='type of normalization layer') + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer", + ) except: pass - parser.add_argument('--wo-norm', default=False, action='store_true', - help='without batch normalization') - - parser.add_argument('--norm-after', default=False, action='store_true', - help='batch normalizaton after activation') + parser.add_argument( + "--wo-norm", + default=False, + action="store_true", + help="without batch normalization", + ) + + parser.add_argument( + "--norm-after", + default=False, + action="store_true", + help="batch normalizaton after activation", + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='DC1d decoder options') + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='DC1d decoder options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/dc1d_encoder.py 
b/hyperion/torch/narchs/dc1d_encoder.py index e4944f1c..091629f4 100644 --- a/hyperion/torch/narchs/dc1d_encoder.py +++ b/hyperion/torch/narchs/dc1d_encoder.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ from jsonargparse import ArgumentParser, ActionParser -import math +import math import torch import torch.nn as nn @@ -15,23 +15,25 @@ class DC1dEncoder(NetArch): - - def __init__(self, in_feats, - in_conv_channels=128, - in_kernel_size=3, - in_stride=1, - conv_repeats=[1, 1, 1], - conv_channels=[128, 64, 32], - conv_kernel_sizes=3, - conv_strides=2, - conv_dilations=1, - head_channels=0, - hid_act='relu6', - head_act=None, - dropout_rate=0, - use_norm=True, - norm_layer=None, - norm_before=True): + def __init__( + self, + in_feats, + in_conv_channels=128, + in_kernel_size=3, + in_stride=1, + conv_repeats=[1, 1, 1], + conv_channels=[128, 64, 32], + conv_kernel_sizes=3, + conv_strides=2, + conv_dilations=1, + head_channels=0, + hid_act="relu6", + head_act=None, + dropout_rate=0, + use_norm=True, + norm_layer=None, + norm_before=True, + ): super().__init__() self.in_feats = in_feats @@ -41,13 +43,17 @@ def __init__(self, in_feats, num_superblocks = len(conv_repeats) self.conv_repeats = conv_repeats self.conv_channels = self._standarize_convblocks_param( - conv_channels, num_superblocks, 'conv_channels') + conv_channels, num_superblocks, "conv_channels" + ) self.conv_kernel_sizes = self._standarize_convblocks_param( - conv_kernel_sizes, num_superblocks, 'conv_kernel_sizes') + conv_kernel_sizes, num_superblocks, "conv_kernel_sizes" + ) self.conv_strides = self._standarize_convblocks_param( - conv_strides, num_superblocks, 'conv_strides') + conv_strides, num_superblocks, "conv_strides" + ) self.conv_dilations = self._standarize_convblocks_param( - conv_dilations, num_superblocks, 'conv_dilations') + conv_dilations, num_superblocks, "conv_dilations" + ) self.head_channels = head_channels self.hid_act = hid_act self.head_act = head_act @@ -57,23 +63,28 @@ def __init__(self, in_feats, self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': - norm_groups = min(np.min(self.conv_channels)//2, 32) + if norm_layer == "group-norm": + norm_groups = min(np.min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block self.in_block = DC1dEncBlock( - in_feats, in_conv_channels, in_kernel_size, - stride=in_stride, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) + in_feats, + in_conv_channels, + in_kernel_size, + stride=in_stride, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) self._context = self.in_block.context self._downsample_factor = self.in_block.stride cur_in_channels = in_conv_channels - - #middle blocks + + # middle blocks self.blocks = nn.ModuleList([]) for i in range(num_superblocks): repeats_i = self.conv_repeats[i] @@ -82,58 +93,76 @@ def __init__(self, in_feats, kernel_size_i = self.conv_kernel_sizes[i] dilation_i = self.conv_dilations[i] block_i = DC1dEncBlock( - cur_in_channels, channels_i, kernel_size_i, - stride=stride_i, dilation=1, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) - + cur_in_channels, + channels_i, + kernel_size_i, + stride=stride_i, + dilation=1, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + 
norm_layer=self._norm_layer, + norm_before=norm_before, + ) + self.blocks.append(block_i) self._context += block_i.context * self._downsample_factor self._downsample_factor *= block_i.stride - for j in range(repeats_i-1): + for j in range(repeats_i - 1): block_i = DC1dEncBlock( - channels_i, channels_i, kernel_size_i, - stride=1, dilation=dilation_i, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) - + channels_i, + channels_i, + kernel_size_i, + stride=1, + dilation=dilation_i, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) + self.blocks.append(block_i) self._context += block_i.context * self._downsample_factor cur_in_channels = channels_i - #head feature block + # head feature block if self.head_channels > 0: self.head_block = DC1dEncBlock( - cur_in_channels, head_channels, kernel_size=1, - stride=1, activation=head_act, - use_norm=False, norm_before=norm_before) + cur_in_channels, + head_channels, + kernel_size=1, + stride=1, + activation=head_act, + use_norm=False, + norm_before=norm_before, + ) self._init_weights(hid_act) - - def _init_weights(self, hid_act): for m in self.modules(): if isinstance(m, nn.Conv1d): if isinstance(hid_act, str): act_name = hid_act if isinstance(hid_act, dict): - act_name = hid_act['name'] - if act_name == 'swish': - act_name = 'relu' + act_name = hid_act["name"] + if act_name == "swish": + act_name = "relu" try: - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity=act_name) + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity=act_name + ) except: - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) elif isinstance(m, nn.BatchNorm1d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) - @staticmethod def _standarize_convblocks_param(p, num_blocks, p_name): if isinstance(p, int): @@ -141,35 +170,36 @@ def _standarize_convblocks_param(p, num_blocks, p_name): elif isinstance(p, list): if len(p) == 1: p = p * num_blocks - - assert len(p) == num_blocks, ( - 'len(%s)(%d)!=%d' % (p_name, len(p), num_blocks)) + + assert len(p) == num_blocks, "len(%s)(%d)!=%d" % ( + p_name, + len(p), + num_blocks, + ) else: - raise TypeError('wrong type for param {}={}'.format(p_name, p)) + raise TypeError("wrong type for param {}={}".format(p_name, p)) return p - def _compute_out_size(self, in_size): - out_size = int((in_size - 1)//self.in_stride+1) + out_size = int((in_size - 1) // self.in_stride + 1) for stride in self.conv_strides: - out_size = int((out_size - 1)//stride+1) + out_size = int((out_size - 1) // stride + 1) return out_size - def in_context(self): return (self._context, self._context) - def in_shape(self): return (None, self.in_feats, None) - def out_shape(self, in_shape=None): - out_channels = self.head_channels if self.head_channels>0 else self.conv_channels[-1] + out_channels = ( + self.head_channels if self.head_channels > 0 else self.conv_channels[-1] + ) if in_shape is None: return (None, out_channels, None) @@ -181,8 +211,6 @@ def out_shape(self, in_shape=None): return (in_shape[0], out_chanels, T) - - def forward(self, x): x = self.in_block(x) @@ -194,144 +222,192 @@ def forward(self, x): return x - def get_config(self): - + head_act = self.head_act hid_act = self.hid_act - config = {'in_feats': self.in_feats, - 'in_conv_channels': self.in_conv_channels, - 
'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'conv_repeats': self.conv_repeats, - 'conv_channels': self.conv_channels, - 'conv_kernel_sizes': self.conv_kernel_sizes, - 'conv_strides': self.conv_strides, - 'conv_dilations': self.conv_dilations, - 'head_channels': self.head_channels, - 'dropout_rate': self.dropout_rate, - 'hid_act': hid_act, - 'head_act': head_act, - 'use_norm': self.use_norm, - 'norm_layer': self.norm_layer, - 'norm_before': self.norm_before, - } - + config = { + "in_feats": self.in_feats, + "in_conv_channels": self.in_conv_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "conv_repeats": self.conv_repeats, + "conv_channels": self.conv_channels, + "conv_kernel_sizes": self.conv_kernel_sizes, + "conv_strides": self.conv_strides, + "conv_dilations": self.conv_dilations, + "head_channels": self.head_channels, + "dropout_rate": self.dropout_rate, + "hid_act": hid_act, + "head_act": head_act, + "use_norm": self.use_norm, + "norm_layer": self.norm_layer, + "norm_before": self.norm_before, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - @staticmethod def filter_args(**kwargs): - if 'wo_norm' in kwargs: - kwargs['use_norm'] = not kwargs['wo_norm'] - del kwargs['wo_norm'] - - if 'norm_after' in kwargs: - kwargs['norm_before'] = not kwargs['norm_after'] - del kwargs['norm_after'] - - - valid_args = ('in_feats', - 'in_conv_channels', 'in_kernel_size', 'in_stride', - 'conv_repeats', 'conv_channels', 'conv_kernel_sizes', - 'conv_strides', 'conv_dilations', - 'head_channels', - 'hid_act', 'had_act', - 'dropout_rate', - 'use_norm', 'norm_layer', 'norm_before') - - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + if "wo_norm" in kwargs: + kwargs["use_norm"] = not kwargs["wo_norm"] + del kwargs["wo_norm"] + + if "norm_after" in kwargs: + kwargs["norm_before"] = not kwargs["norm_after"] + del kwargs["norm_after"] + + valid_args = ( + "in_feats", + "in_conv_channels", + "in_kernel_size", + "in_stride", + "conv_repeats", + "conv_channels", + "conv_kernel_sizes", + "conv_strides", + "conv_dilations", + "head_channels", + "hid_act", + "had_act", + "dropout_rate", + "use_norm", + "norm_layer", + "norm_before", + ) + + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args - - @staticmethod def add_class_args(parser, prefix=None, head_channels=False, in_feats=False): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") if in_feats: parser.add_argument( - '--in-feats', type=int, required=True, - help=('input feature dimension')) + "--in-feats", type=int, required=True, help=("input feature dimension") + ) parser.add_argument( - '--in-conv-channels', default=128, type=int, - help=('number of output channels in input convolution')) + "--in-conv-channels", + default=128, + type=int, + help=("number of output channels in input convolution"), + ) parser.add_argument( - '--in-kernel-size', default=3, type=int, - help=('kernel size of input convolution')) + "--in-kernel-size", + default=3, + type=int, + help=("kernel size of input convolution"), + ) - parser.add_argument('--in-stride', default=1, type=int, - help=('stride of input convolution')) + parser.add_argument( + "--in-stride", default=1, type=int, help=("stride of input convolution") + ) parser.add_argument( - '--conv-repeats', default=[1, 1, 1], type=int, - nargs='+', help=('conv-blocks repeats in each encoder stage')) + 
"--conv-repeats", + default=[1, 1, 1], + type=int, + nargs="+", + help=("conv-blocks repeats in each encoder stage"), + ) parser.add_argument( - '--conv-channels', default=[128, 64, 32], - type=int, nargs='+', - help=('conv-blocks channels for each stage')) + "--conv-channels", + default=[128, 64, 32], + type=int, + nargs="+", + help=("conv-blocks channels for each stage"), + ) parser.add_argument( - '--conv-kernel-sizes', default=[3], - nargs='+', type=int, help=('conv-blocks kernels for each encoder stage')) + "--conv-kernel-sizes", + default=[3], + nargs="+", + type=int, + help=("conv-blocks kernels for each encoder stage"), + ) parser.add_argument( - '--conv-strides', default=[2], - nargs='+', type=int, help=('conv-blocks strides for each encoder stage')) + "--conv-strides", + default=[2], + nargs="+", + type=int, + help=("conv-blocks strides for each encoder stage"), + ) parser.add_argument( - '--conv-dilations', default=[1], - nargs='+', type=int, help=('conv-blocks dilations for each encoder stage')) + "--conv-dilations", + default=[1], + nargs="+", + type=int, + help=("conv-blocks dilations for each encoder stage"), + ) if head_channels: parser.add_argument( - '--head-channels', default=16, type=int, - help=('channels in the last conv block of encoder')) + "--head-channels", + default=16, + type=int, + help=("channels in the last conv block of encoder"), + ) try: - parser.add_argument('--hid-act', default='relu6', - help='hidden activation') + parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass - - parser.add_argument('--head-act', default=None, - help='activation in encoder head') - + + parser.add_argument( + "--head-act", default=None, help="activation in encoder head" + ) + try: - parser.add_argument('--dropout-rate', default=0, type=float, - help='dropout probability') + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) except: pass try: parser.add_argument( - '--norm-layer', default=None, - choices=['batch-norm', 'group-norm', 'instance-norm', 'instance-norm-affine', 'layer-norm'], - help='type of normalization layer') + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer", + ) except: pass - parser.add_argument('--wo-norm', default=False, action='store_true', - help='without batch normalization') - - parser.add_argument('--norm-after', default=False, action='store_true', - help='batch normalizaton after activation') + parser.add_argument( + "--wo-norm", + default=False, + action="store_true", + help="without batch normalization", + ) + + parser.add_argument( + "--norm-after", + default=False, + action="store_true", + help="batch normalizaton after activation", + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='DC1d encoder options') + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='DC1d encoder options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/dc2d_decoder.py b/hyperion/torch/narchs/dc2d_decoder.py index ca46b893..6ad7c4c9 100644 --- a/hyperion/torch/narchs/dc2d_decoder.py +++ b/hyperion/torch/narchs/dc2d_decoder.py @@ -4,7 +4,7 @@ """ from jsonargparse import ArgumentParser, ActionParser -import math +import math import torch import torch.nn as nn @@ -17,23 +17,25 @@ class DC2dDecoder(NetArch): - - def __init__(self, in_channels=32, - 
in_conv_channels=32, - in_kernel_size=3, - in_stride=1, - conv_repeats=[1, 1, 1], - conv_channels=[64, 128, 128], - conv_kernel_sizes=3, - conv_strides=2, - conv_dilations=1, - head_channels=0, - hid_act='relu6', - head_act=None, - dropout_rate=0, - use_norm=True, - norm_layer=None, - norm_before=True): + def __init__( + self, + in_channels=32, + in_conv_channels=32, + in_kernel_size=3, + in_stride=1, + conv_repeats=[1, 1, 1], + conv_channels=[64, 128, 128], + conv_kernel_sizes=3, + conv_strides=2, + conv_dilations=1, + head_channels=0, + hid_act="relu6", + head_act=None, + dropout_rate=0, + use_norm=True, + norm_layer=None, + norm_before=True, + ): super().__init__() self.in_channels = in_channels @@ -43,13 +45,17 @@ def __init__(self, in_channels=32, num_superblocks = len(conv_repeats) self.conv_repeats = conv_repeats self.conv_channels = self._standarize_convblocks_param( - conv_channels, num_superblocks, 'conv_channels') + conv_channels, num_superblocks, "conv_channels" + ) self.conv_kernel_sizes = self._standarize_convblocks_param( - conv_kernel_sizes, num_superblocks, 'conv_kernel_sizes') + conv_kernel_sizes, num_superblocks, "conv_kernel_sizes" + ) self.conv_strides = self._standarize_convblocks_param( - conv_strides, num_superblocks, 'conv_strides') + conv_strides, num_superblocks, "conv_strides" + ) self.conv_dilations = self._standarize_convblocks_param( - conv_dilations, num_superblocks, 'conv_dilations') + conv_dilations, num_superblocks, "conv_dilations" + ) self.head_channels = head_channels self.hid_act = hid_act self.head_act = head_act @@ -59,23 +65,28 @@ def __init__(self, in_channels=32, self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': - norm_groups = min(np.min(self.conv_channels)//2, 32) + if norm_layer == "group-norm": + norm_groups = min(np.min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block self.in_block = DC2dDecBlock( - in_channels, in_conv_channels, in_kernel_size, - stride=in_stride, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) + in_channels, + in_conv_channels, + in_kernel_size, + stride=in_stride, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) self._context = self.in_block.context self._upsample_factor = self.in_block.stride cur_in_channels = in_conv_channels - - #middle blocks + + # middle blocks self.blocks = nn.ModuleList([]) for i in range(num_superblocks): repeats_i = self.conv_repeats[i] @@ -84,50 +95,69 @@ def __init__(self, in_channels=32, kernel_size_i = self.conv_kernel_sizes[i] dilation_i = self.conv_dilations[i] block_i = DC2dDecBlock( - cur_in_channels, channels_i, kernel_size_i, - stride=stride_i, dilation=1, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) - + cur_in_channels, + channels_i, + kernel_size_i, + stride=stride_i, + dilation=1, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) + self.blocks.append(block_i) self._context += block_i.context * self._upsample_factor self._upsample_factor *= block_i.stride - for j in range(repeats_i-1): + for j in range(repeats_i - 1): block_i = DC2dDecBlock( - channels_i, channels_i, kernel_size_i, - stride=1, dilation=dilation_i, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, 
norm_layer=self._norm_layer, - norm_before=norm_before) - + channels_i, + channels_i, + kernel_size_i, + stride=1, + dilation=dilation_i, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) + self.blocks.append(block_i) self._context += block_i.context * self._upsample_factor cur_in_channels = channels_i - #head feature block + # head feature block if self.head_channels > 0: self.head_block = DC2dDecBlock( - cur_in_channels, head_channels, kernel_size=1, - stride=1, activation=head_act, - use_norm=False, norm_before=norm_before) + cur_in_channels, + head_channels, + kernel_size=1, + stride=1, + activation=head_act, + use_norm=False, + norm_before=norm_before, + ) self._init_weights(hid_act) - - def _init_weights(self, hid_act): if isinstance(hid_act, str): act_name = hid_act if isinstance(hid_act, dict): - act_name = hid_act['name'] - if act_name in ['relu6', 'swish']: - act_name = 'relu' + act_name = hid_act["name"] + if act_name in ["relu6", "swish"]: + act_name = "relu" - init_f1 = lambda x: nn.init.kaiming_normal_(x, mode='fan_out', nonlinearity=act_name) - init_f2 = lambda x: nn.init.kaiming_normal_(x, mode='fan_out', nonlinearity='relu') + init_f1 = lambda x: nn.init.kaiming_normal_( + x, mode="fan_out", nonlinearity=act_name + ) + init_f2 = lambda x: nn.init.kaiming_normal_( + x, mode="fan_out", nonlinearity="relu" + ) for m in self.modules(): if isinstance(m, nn.Conv2d): @@ -139,7 +169,7 @@ def _init_weights(self, hid_act): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) - #re-init subpixelconvs + # re-init subpixelconvs for m in self.modules(): if isinstance(m, SubPixelConv2d): try: @@ -161,7 +191,6 @@ def _init_weights(self, hid_act): # nn.init.constant_(m.weight, 1) # nn.init.constant_(m.bias, 0) - @staticmethod def _standarize_convblocks_param(p, num_blocks, p_name): if isinstance(p, int): @@ -169,36 +198,37 @@ def _standarize_convblocks_param(p, num_blocks, p_name): elif isinstance(p, list): if len(p) == 1: p = p * num_blocks - - assert len(p) == num_blocks, ( - 'len(%s)(%d)!=%d' % (p_name, len(p), num_blocks)) + + assert len(p) == num_blocks, "len(%s)(%d)!=%d" % ( + p_name, + len(p), + num_blocks, + ) else: - raise TypeError('wrong type for param {}={}'.format(p_name, p)) + raise TypeError("wrong type for param {}={}".format(p_name, p)) return p - def _compute_out_size(self, in_size): out_size = in_size * in_stride for stride in self.conv_strides: - out_size *= stride + out_size *= stride return out_size - def in_context(self): - in_context = int(math.ceil(self._context/self._upsample_factor)) + in_context = int(math.ceil(self._context / self._upsample_factor)) return (in_context, in_context) - def in_shape(self): return (None, self.in_channels, None, None) - def out_shape(self, in_shape=None): - out_channels = self.head_channels if self.head_channels>0 else self.conv_channels[-1] + out_channels = ( + self.head_channels if self.head_channels > 0 else self.conv_channels[-1] + ) if in_shape is None: return (None, out_channels, None, None) @@ -215,21 +245,17 @@ def out_shape(self, in_shape=None): return (in_shape[0], out_chanels, H, W) - - def _match_shape(self, x, target_shape): x_dim = x.dim() ddim = x_dim - len(target_shape) for i in range(2, x_dim): - surplus = x.size(i) - target_shape[i-ddim] + surplus = x.size(i) - target_shape[i - ddim] assert surplus >= 0 if surplus > 0: - x = torch.narrow(x, i, surplus//2, target_shape[i-ddim]) + x = torch.narrow(x, i, surplus // 2, 
target_shape[i - ddim]) return x.contiguous() - - def forward(self, x, target_shape=None): x = self.in_block(x) @@ -244,143 +270,191 @@ def forward(self, x, target_shape=None): return x - def get_config(self): - + head_act = self.head_act hid_act = self.hid_act - config = {'in_channels': self.in_channels, - 'in_conv_channels': self.in_conv_channels, - 'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'conv_repeats': self.conv_repeats, - 'conv_channels': self.conv_channels, - 'conv_kernel_sizes': self.conv_kernel_sizes, - 'conv_strides': self.conv_strides, - 'conv_dilations': self.conv_dilations, - 'head_channels': self.head_channels, - 'dropout_rate': self.dropout_rate, - 'hid_act': hid_act, - 'head_act': head_act, - 'use_norm': self.use_norm, - 'norm_layer': self.norm_layer, - 'norm_before': self.norm_before, - } - + config = { + "in_channels": self.in_channels, + "in_conv_channels": self.in_conv_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "conv_repeats": self.conv_repeats, + "conv_channels": self.conv_channels, + "conv_kernel_sizes": self.conv_kernel_sizes, + "conv_strides": self.conv_strides, + "conv_dilations": self.conv_dilations, + "head_channels": self.head_channels, + "dropout_rate": self.dropout_rate, + "hid_act": hid_act, + "head_act": head_act, + "use_norm": self.use_norm, + "norm_layer": self.norm_layer, + "norm_before": self.norm_before, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - @staticmethod def filter_args(**kwargs): - if 'wo_norm' in kwargs: - kwargs['use_norm'] = not kwargs['wo_norm'] - del kwargs['wo_norm'] - - if 'norm_after' in kwargs: - kwargs['norm_before'] = not kwargs['norm_after'] - del kwargs['norm_after'] - - - valid_args = ('in_channels', - 'in_conv_channels', 'in_kernel_size', 'in_stride', - 'conv_repeats', 'conv_channels', 'conv_kernel_sizes', - 'conv_strides', 'conv_dilations', - 'head_channels', - 'hid_act', 'had_act', - 'dropout_rate', - 'use_norm', 'norm_layer', 'norm_before') - - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + if "wo_norm" in kwargs: + kwargs["use_norm"] = not kwargs["wo_norm"] + del kwargs["wo_norm"] + + if "norm_after" in kwargs: + kwargs["norm_before"] = not kwargs["norm_after"] + del kwargs["norm_after"] + + valid_args = ( + "in_channels", + "in_conv_channels", + "in_kernel_size", + "in_stride", + "conv_repeats", + "conv_channels", + "conv_kernel_sizes", + "conv_strides", + "conv_dilations", + "head_channels", + "hid_act", + "had_act", + "dropout_rate", + "use_norm", + "norm_layer", + "norm_before", + ) + + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args - - @staticmethod def add_class_args(parser, prefix=None, head_channels=False): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--in-channels', type=int, default=80, - help=('input channels of decoder')) + "--in-channels", type=int, default=80, help=("input channels of decoder") + ) parser.add_argument( - '--in-conv-channels', default=128, type=int, - help=('number of output channels in input convolution')) + "--in-conv-channels", + default=128, + type=int, + help=("number of output channels in input convolution"), + ) parser.add_argument( - '--in-kernel-size', default=3, type=int, - help=('kernel size of input convolution')) + "--in-kernel-size", + default=3, + type=int, + help=("kernel size of input convolution"), + ) - 
parser.add_argument('--in-stride', default=1, type=int, - help=('stride of input convolution')) + parser.add_argument( + "--in-stride", default=1, type=int, help=("stride of input convolution") + ) parser.add_argument( - '--conv-repeats', default=[1, 1, 1], type=int, - nargs='+', help=('conv-blocks repeats in each decoder stage')) + "--conv-repeats", + default=[1, 1, 1], + type=int, + nargs="+", + help=("conv-blocks repeats in each decoder stage"), + ) parser.add_argument( - '--conv-channels', default=[64, 128, 128], - type=int, nargs='+', - help=('conv-blocks channels for each decoder stage')) + "--conv-channels", + default=[64, 128, 128], + type=int, + nargs="+", + help=("conv-blocks channels for each decoder stage"), + ) parser.add_argument( - '--conv-kernel-sizes', default=[3], - nargs='+', type=int, help=('conv-blocks kernels for each decoder stage')) + "--conv-kernel-sizes", + default=[3], + nargs="+", + type=int, + help=("conv-blocks kernels for each decoder stage"), + ) parser.add_argument( - '--conv-strides', default=[2], - nargs='+', type=int, help=('conv-blocks strides for each decoder stage')) + "--conv-strides", + default=[2], + nargs="+", + type=int, + help=("conv-blocks strides for each decoder stage"), + ) parser.add_argument( - '--conv-dilations', default=[1], - nargs='+', type=int, help=('conv-blocks dilations for each decoder stage')) + "--conv-dilations", + default=[1], + nargs="+", + type=int, + help=("conv-blocks dilations for each decoder stage"), + ) if head_channels: parser.add_argument( - '--head-channels', type=int, required=True, - help=('channels in the last conv block of decoder')) + "--head-channels", + type=int, + required=True, + help=("channels in the last conv block of decoder"), + ) try: - parser.add_argument('--hid-act', default='relu6', - help='hidden activation') + parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass - - parser.add_argument('--head-act', default=None, - help='activation in encoder head') - + + parser.add_argument( + "--head-act", default=None, help="activation in encoder head" + ) + try: - parser.add_argument('--dropout-rate', default=0, type=float, - help='dropout probability') + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) except: pass try: parser.add_argument( - '--norm-layer', default=None, - choices=['batch-norm', 'group-norm', 'instance-norm', 'instance-norm-affine', 'layer-norm'], - help='type of normalization layer') + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer", + ) except: pass - parser.add_argument('--wo-norm', default=False, action='store_true', - help='without batch normalization') - - parser.add_argument('--norm-after', default=False, action='store_true', - help='batch normalizaton after activation') + parser.add_argument( + "--wo-norm", + default=False, + action="store_true", + help="without batch normalization", + ) + + parser.add_argument( + "--norm-after", + default=False, + action="store_true", + help="batch normalizaton after activation", + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='DC2d decoder options') + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='DC2d decoder options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/dc2d_encoder.py 
b/hyperion/torch/narchs/dc2d_encoder.py index 5d458bb5..c6857ff6 100644 --- a/hyperion/torch/narchs/dc2d_encoder.py +++ b/hyperion/torch/narchs/dc2d_encoder.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math +import math from jsonargparse import ArgumentParser, ActionParser import torch @@ -16,23 +16,25 @@ class DC2dEncoder(NetArch): - - def __init__(self, in_channels=1, - in_conv_channels=128, - in_kernel_size=3, - in_stride=1, - conv_repeats=[1, 1, 1], - conv_channels=[128, 64, 32], - conv_kernel_sizes=3, - conv_strides=2, - conv_dilations=1, - head_channels=0, - hid_act='relu6', - head_act=None, - dropout_rate=0, - use_norm=True, - norm_layer=None, - norm_before=True): + def __init__( + self, + in_channels=1, + in_conv_channels=128, + in_kernel_size=3, + in_stride=1, + conv_repeats=[1, 1, 1], + conv_channels=[128, 64, 32], + conv_kernel_sizes=3, + conv_strides=2, + conv_dilations=1, + head_channels=0, + hid_act="relu6", + head_act=None, + dropout_rate=0, + use_norm=True, + norm_layer=None, + norm_before=True, + ): super().__init__() self.in_channels = in_channels @@ -42,13 +44,17 @@ def __init__(self, in_channels=1, num_superblocks = len(conv_repeats) self.conv_repeats = conv_repeats self.conv_channels = self._standarize_convblocks_param( - conv_channels, num_superblocks, 'conv_channels') + conv_channels, num_superblocks, "conv_channels" + ) self.conv_kernel_sizes = self._standarize_convblocks_param( - conv_kernel_sizes, num_superblocks, 'conv_kernel_sizes') + conv_kernel_sizes, num_superblocks, "conv_kernel_sizes" + ) self.conv_strides = self._standarize_convblocks_param( - conv_strides, num_superblocks, 'conv_strides') + conv_strides, num_superblocks, "conv_strides" + ) self.conv_dilations = self._standarize_convblocks_param( - conv_dilations, num_superblocks, 'conv_dilations') + conv_dilations, num_superblocks, "conv_dilations" + ) self.head_channels = head_channels self.hid_act = hid_act self.head_act = head_act @@ -58,23 +64,28 @@ def __init__(self, in_channels=1, self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': - norm_groups = min(np.min(self.conv_channels)//2, 32) + if norm_layer == "group-norm": + norm_groups = min(np.min(self.conv_channels) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block self.in_block = DC2dEncBlock( - in_channels, in_conv_channels, in_kernel_size, - stride=in_stride, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) + in_channels, + in_conv_channels, + in_kernel_size, + stride=in_stride, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) self._context = self.in_block.context self._downsample_factor = self.in_block.stride cur_in_channels = in_conv_channels - - #middle blocks + + # middle blocks self.blocks = nn.ModuleList([]) for i in range(num_superblocks): repeats_i = self.conv_repeats[i] @@ -83,58 +94,76 @@ def __init__(self, in_channels=1, kernel_size_i = self.conv_kernel_sizes[i] dilation_i = self.conv_dilations[i] block_i = DC2dEncBlock( - cur_in_channels, channels_i, kernel_size_i, - stride=stride_i, dilation=1, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) - + cur_in_channels, + channels_i, + kernel_size_i, + stride=stride_i, + dilation=1, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + 
norm_layer=self._norm_layer, + norm_before=norm_before, + ) + self.blocks.append(block_i) self._context += block_i.context * self._downsample_factor self._downsample_factor *= block_i.stride - for j in range(repeats_i-1): + for j in range(repeats_i - 1): block_i = DC2dEncBlock( - channels_i, channels_i, kernel_size_i, - stride=1, dilation=dilation_i, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) - + channels_i, + channels_i, + kernel_size_i, + stride=1, + dilation=dilation_i, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) + self.blocks.append(block_i) self._context += block_i.context * self._downsample_factor cur_in_channels = channels_i - #head feature block + # head feature block if self.head_channels > 0: self.head_block = DC2dEncBlock( - cur_in_channels, head_channels, kernel_size=1, - stride=1, activation=head_act, - use_norm=False, norm_before=norm_before) + cur_in_channels, + head_channels, + kernel_size=1, + stride=1, + activation=head_act, + use_norm=False, + norm_before=norm_before, + ) self._init_weights(hid_act) - - def _init_weights(self, hid_act): for m in self.modules(): if isinstance(m, nn.Conv2d): if isinstance(hid_act, str): act_name = hid_act if isinstance(hid_act, dict): - act_name = hid_act['name'] - if act_name == 'swish': - act_name = 'relu' + act_name = hid_act["name"] + if act_name == "swish": + act_name = "relu" try: - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity=act_name) + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity=act_name + ) except: - nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu') + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) elif isinstance(m, nn.BatchNorm2d): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) - @staticmethod def _standarize_convblocks_param(p, num_blocks, p_name): if isinstance(p, int): @@ -142,35 +171,36 @@ def _standarize_convblocks_param(p, num_blocks, p_name): elif isinstance(p, list): if len(p) == 1: p = p * num_blocks - - assert len(p) == num_blocks, ( - 'len(%s)(%d)!=%d' % (p_name, len(p), num_blocks)) + + assert len(p) == num_blocks, "len(%s)(%d)!=%d" % ( + p_name, + len(p), + num_blocks, + ) else: - raise TypeError('wrong type for param {}={}'.format(p_name, p)) + raise TypeError("wrong type for param {}={}".format(p_name, p)) return p - def _compute_out_size(self, in_size): - out_size = int((in_size - 1)//self.in_stride+1) + out_size = int((in_size - 1) // self.in_stride + 1) for stride in self.conv_strides: - out_size = int((out_size - 1)//stride+1) + out_size = int((out_size - 1) // stride + 1) return out_size - def in_context(self): return (self._context, self._context) - def in_shape(self): return (None, self.in_channels, None, None) - def out_shape(self, in_shape=None): - out_channels = self.head_channels if self.head_channels>0 else self.conv_channels[-1] + out_channels = ( + self.head_channels if self.head_channels > 0 else self.conv_channels[-1] + ) if in_shape is None: return (None, out_channels, None, None) @@ -187,8 +217,6 @@ def out_shape(self, in_shape=None): return (in_shape[0], out_chanels, H, W) - - def forward(self, x): x = self.in_block(x) @@ -200,142 +228,191 @@ def forward(self, x): return x - def get_config(self): - + head_act = self.head_act hid_act = self.hid_act - config = {'in_channels': self.in_channels, - 'in_conv_channels': 
self.in_conv_channels, - 'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'conv_repeats': self.conv_repeats, - 'conv_channels': self.conv_channels, - 'conv_kernel_sizes': self.conv_kernel_sizes, - 'conv_strides': self.conv_strides, - 'conv_dilations': self.conv_dilations, - 'head_channels': self.head_channels, - 'dropout_rate': self.dropout_rate, - 'hid_act': hid_act, - 'head_act': head_act, - 'use_norm': self.use_norm, - 'norm_layer': self.norm_layer, - 'norm_before': self.norm_before, - } - + config = { + "in_channels": self.in_channels, + "in_conv_channels": self.in_conv_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "conv_repeats": self.conv_repeats, + "conv_channels": self.conv_channels, + "conv_kernel_sizes": self.conv_kernel_sizes, + "conv_strides": self.conv_strides, + "conv_dilations": self.conv_dilations, + "head_channels": self.head_channels, + "dropout_rate": self.dropout_rate, + "hid_act": hid_act, + "head_act": head_act, + "use_norm": self.use_norm, + "norm_layer": self.norm_layer, + "norm_before": self.norm_before, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - @staticmethod def filter_args(**kwargs): - if 'wo_norm' in kwargs: - kwargs['use_norm'] = not kwargs['wo_norm'] - del kwargs['wo_norm'] - - if 'norm_after' in kwargs: - kwargs['norm_before'] = not kwargs['norm_after'] - del kwargs['norm_after'] - - valid_args = ('in_channels', - 'in_conv_channels', 'in_kernel_size', 'in_stride', - 'conv_repeats', 'conv_channels', 'conv_kernel_sizes', - 'conv_strides', 'conv_dilations', - 'head_channels', - 'hid_act', 'had_act', - 'dropout_rate', - 'use_norm', 'norm_layer', 'norm_before') - - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + if "wo_norm" in kwargs: + kwargs["use_norm"] = not kwargs["wo_norm"] + del kwargs["wo_norm"] + + if "norm_after" in kwargs: + kwargs["norm_before"] = not kwargs["norm_after"] + del kwargs["norm_after"] + + valid_args = ( + "in_channels", + "in_conv_channels", + "in_kernel_size", + "in_stride", + "conv_repeats", + "conv_channels", + "conv_kernel_sizes", + "conv_strides", + "conv_dilations", + "head_channels", + "hid_act", + "had_act", + "dropout_rate", + "use_norm", + "norm_layer", + "norm_before", + ) + + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args - - @staticmethod def add_class_args(parser, prefix=None, head_channels=False): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--in-channels', type=int, default=1, - help=('input channel dimension')) + "--in-channels", type=int, default=1, help=("input channel dimension") + ) parser.add_argument( - '--in-conv-channels', default=128, type=int, - help=('number of output channels in input convolution')) + "--in-conv-channels", + default=128, + type=int, + help=("number of output channels in input convolution"), + ) parser.add_argument( - '--in-kernel-size', default=3, type=int, - help=('kernel size of input convolution')) + "--in-kernel-size", + default=3, + type=int, + help=("kernel size of input convolution"), + ) - parser.add_argument('--in-stride', default=1, type=int, - help=('stride of input convolution')) + parser.add_argument( + "--in-stride", default=1, type=int, help=("stride of input convolution") + ) parser.add_argument( - '--conv-repeats', default=[1, 1, 1], type=int, - nargs='+', help=('conv-blocks repeats in each encoder stage')) + 
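filter_args first folds the two negated command-line flags (--wo-norm, --norm-after) into the positive constructor arguments and then keeps only the keys the constructor accepts; note that "had_act" in valid_args looks like a typo for "head_act", so the head activation would currently be dropped by the filter. The pattern in isolation (a sketch with a shortened valid-argument list and an illustrative function name):

def filter_encoder_args(**kwargs):
    if "wo_norm" in kwargs:
        kwargs["use_norm"] = not kwargs.pop("wo_norm")
    if "norm_after" in kwargs:
        kwargs["norm_before"] = not kwargs.pop("norm_after")
    valid_args = ("in_channels", "hid_act", "head_act", "dropout_rate", "use_norm", "norm_before")
    return {k: kwargs[k] for k in valid_args if k in kwargs}

# filter_encoder_args(wo_norm=True, norm_after=False, hid_act="relu6", unrelated=1)
# -> {"hid_act": "relu6", "use_norm": False, "norm_before": True}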
"--conv-repeats", + default=[1, 1, 1], + type=int, + nargs="+", + help=("conv-blocks repeats in each encoder stage"), + ) parser.add_argument( - '--conv-channels', default=[128, 64, 32], - type=int, nargs='+', - help=('conv-blocks channels for each stage')) + "--conv-channels", + default=[128, 64, 32], + type=int, + nargs="+", + help=("conv-blocks channels for each stage"), + ) parser.add_argument( - '--conv-kernel-sizes', default=[3], - nargs='+', type=int, help=('conv-blocks kernels for each encoder stage')) + "--conv-kernel-sizes", + default=[3], + nargs="+", + type=int, + help=("conv-blocks kernels for each encoder stage"), + ) parser.add_argument( - '--conv-strides', default=[2], - nargs='+', type=int, help=('conv-blocks strides for each encoder stage')) + "--conv-strides", + default=[2], + nargs="+", + type=int, + help=("conv-blocks strides for each encoder stage"), + ) parser.add_argument( - '--conv-dilations', default=[1], - nargs='+', type=int, help=('conv-blocks dilations for each encoder stage')) + "--conv-dilations", + default=[1], + nargs="+", + type=int, + help=("conv-blocks dilations for each encoder stage"), + ) if head_channels: parser.add_argument( - '--head-channels', default=16, type=int, - help=('channels in the last conv block of encoder')) + "--head-channels", + default=16, + type=int, + help=("channels in the last conv block of encoder"), + ) try: - parser.add_argument('--hid-act', default='relu6', - help='hidden activation') + parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass - - parser.add_argument('--head-act', default=None, - help='activation in encoder head') - + + parser.add_argument( + "--head-act", default=None, help="activation in encoder head" + ) + try: - parser.add_argument('--dropout-rate', default=0, type=float, - help='dropout probability') + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) except: pass try: parser.add_argument( - '--norm-layer', default=None, - choices=['batch-norm', 'group-norm', 'instance-norm', 'instance-norm-affine', 'layer-norm'], - help='type of normalization layer') + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer", + ) except: pass - parser.add_argument('--wo-norm', default=False, action='store_true', - help='without batch normalization') - - parser.add_argument('--norm-after', default=False, action='store_true', - help='batch normalizaton after activation') + parser.add_argument( + "--wo-norm", + default=False, + action="store_true", + help="without batch normalization", + ) + + parser.add_argument( + "--norm-after", + default=False, + action="store_true", + help="batch normalizaton after activation", + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='DC2d encoder options') + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='DC2d encoder options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/etdnn.py b/hyperion/torch/narchs/etdnn.py index 25f01d0a..ebc14534 100644 --- a/hyperion/torch/narchs/etdnn.py +++ b/hyperion/torch/narchs/etdnn.py @@ -16,14 +16,24 @@ class ETDNNV1(NetArch): - - def __init__(self, num_blocks, - in_units, hid_units, out_units=0, - kernel_size=3, dilation=1, dilation_factor=1, - hid_act={'name':'relu', 'inplace':True}, out_act=None, - dropout_rate=0, - 
norm_layer=None, use_norm=True, norm_before=True, in_norm=True, - pooling=None): + def __init__( + self, + num_blocks, + in_units, + hid_units, + out_units=0, + kernel_size=3, + dilation=1, + dilation_factor=1, + hid_act={"name": "relu", "inplace": True}, + out_act=None, + dropout_rate=0, + norm_layer=None, + use_norm=True, + norm_before=True, + in_norm=True, + pooling=None, + ): super().__init__() @@ -55,26 +65,34 @@ def __init__(self, num_blocks, if isinstance(dilation, list): assert num_blocks == len(dilation) else: - dilation = [dilation_factor*i+dilation for i in range(num_blocks)] + dilation = [dilation_factor * i + dilation for i in range(num_blocks)] # past and future context - self._context = int(np.sum(np.array(dilation)*( - np.array(kernel_size)-1)/2)) + self._context = int( + np.sum(np.array(dilation) * (np.array(kernel_size) - 1) / 2) + ) self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': - norm_groups = min(np.min(hid_units)//2, 32) + if norm_layer == "group-norm": + norm_groups = min(np.min(hid_units) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) blocks = [] for i in range(num_blocks): blocks.append( - ETDNNBlock(units[i], units[i+1], - kernel_size=kernel_size[i], dilation=dilation[i], - activation=hid_act, dropout_rate=dropout_rate, - norm_layer=self._norm_layer, - use_norm=use_norm, norm_before=norm_before)) + ETDNNBlock( + units[i], + units[i + 1], + kernel_size=kernel_size[i], + dilation=dilation[i], + activation=hid_act, + dropout_rate=dropout_rate, + norm_layer=self._norm_layer, + use_norm=use_norm, + norm_before=norm_before, + ) + ) self.blocks = nn.ModuleList(blocks) @@ -82,19 +100,17 @@ def __init__(self, num_blocks, if out_units == 0: self.out_act = None self.output = None - return + return self.with_output = True self.out_act = AF.create(out_act) self.output = Linear(units[-1], out_units) - @property def in_context(self): return (self._context, self._context) - - + def forward(self, x): for i in range(self.num_blocks): @@ -102,12 +118,12 @@ def forward(self, x): if self.with_output: if self.pooling is not None: - if self.pooling == 'mean': + if self.pooling == "mean": x = torch.mean(x, dim=2) - elif self.pooling == 'max': + elif self.pooling == "max": x = torch.max(x, dim=2) else: - raise Exception('pooling=%s not implemented' % (self.pooling)) + raise Exception("pooling=%s not implemented" % (self.pooling)) else: x = torch.transpose(x, 1, 2) @@ -117,36 +133,35 @@ def forward(self, x): return x - def get_config(self): - + out_act = AF.get_config(self.out_act) - hid_act = AF.get_config(self.blocks[0].activation1) - - config = {'num_blocks': self.num_blocks, - 'in_units': self.in_units, - 'hid_units': self.hid_units, - 'out_units': self.out_units, - 'kernel_size': self.kernel_size, - 'dilation': self.dilation, - 'dilation_factor': self.dilation_factor, - 'dropout_rate': self.dropout_rate, - 'norm_layer': self.norm_layer, - 'use_norm': self.use_norm, - 'norm_before': self.norm_before, - 'in_norm' : self.in_norm, - 'out_act': out_act, - 'hid_act': hid_act, - 'pooling': self.pooling } - + hid_act = AF.get_config(self.blocks[0].activation1) + + config = { + "num_blocks": self.num_blocks, + "in_units": self.in_units, + "hid_units": self.hid_units, + "out_units": self.out_units, + "kernel_size": self.kernel_size, + "dilation": self.dilation, + "dilation_factor": self.dilation_factor, + "dropout_rate": self.dropout_rate, + "norm_layer": self.norm_layer, + "use_norm": self.use_norm, + "norm_before": self.norm_before, + 
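ETDNNV1 (and ResETDNNV1 below) derive a linearly growing dilation schedule from dilation and dilation_factor, and the stored context is the sum over blocks of dilation * (kernel_size - 1) / 2, i.e. the one-sided receptive field in frames. The computation on its own (a sketch, function name is illustrative):

import numpy as np

def tdnn_context(num_blocks, kernel_size=3, dilation=1, dilation_factor=1):
    kernel_size = [kernel_size] * num_blocks
    dilation = [dilation_factor * i + dilation for i in range(num_blocks)]
    return int(np.sum(np.array(dilation) * (np.array(kernel_size) - 1) / 2))

# tdnn_context(5) -> 15: dilations [1, 2, 3, 4, 5], each block adds (3-1)/2 * dilation frames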
"in_norm": self.in_norm, + "out_act": out_act, + "hid_act": hid_act, + "pooling": self.pooling, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def in_shape(self): return (None, self.in_units, None) - def out_shape(self, in_shape=None): if self.with_output: return (None, self.out_units) @@ -161,7 +176,3 @@ def out_shape(self, in_shape=None): assert len(in_shape) == 3 return (in_shape[0], out_units, in_shape[2]) - - - - diff --git a/hyperion/torch/narchs/net_arch.py b/hyperion/torch/narchs/net_arch.py index 22c2b76d..9a3fc65f 100644 --- a/hyperion/torch/narchs/net_arch.py +++ b/hyperion/torch/narchs/net_arch.py @@ -9,9 +9,8 @@ from ..torch_model import TorchModel -class NetArch(TorchModel): - +class NetArch(TorchModel): def in_context(self): return 0 @@ -26,4 +25,3 @@ def in_shape(self): def out_shape(self, in_shape=None): raise NotImplementedError() - diff --git a/hyperion/torch/narchs/resetdnn.py b/hyperion/torch/narchs/resetdnn.py index b1654368..2c7f3e00 100644 --- a/hyperion/torch/narchs/resetdnn.py +++ b/hyperion/torch/narchs/resetdnn.py @@ -16,14 +16,25 @@ class ResETDNNV1(NetArch): - - def __init__(self, num_blocks, - in_units, hid_units, expand_units, out_units=0, - kernel_size=3, dilation=1, dilation_factor=1, - hid_act={'name':'relu', 'inplace':True}, out_act=None, - dropout_rate=0, - norm_layer=None, use_norm=True, norm_before=True, in_norm=True, - pooling=None): + def __init__( + self, + num_blocks, + in_units, + hid_units, + expand_units, + out_units=0, + kernel_size=3, + dilation=1, + dilation_factor=1, + hid_act={"name": "relu", "inplace": True}, + out_act=None, + dropout_rate=0, + norm_layer=None, + use_norm=True, + norm_before=True, + in_norm=True, + pooling=None, + ): super().__init__() @@ -41,7 +52,7 @@ def __init__(self, num_blocks, self.in_norm = in_norm self.pooling = pooling - assert num_blocks > 2, 'ResETDNN requires at least 3 layer blocks' + assert num_blocks > 2, "ResETDNN requires at least 3 layer blocks" if isinstance(kernel_size, list): assert num_blocks == len(kernel_size) @@ -51,41 +62,62 @@ def __init__(self, num_blocks, if isinstance(dilation, list): assert num_blocks == len(dilation) else: - dilation = [dilation_factor*i+dilation for i in range(num_blocks)] + dilation = [dilation_factor * i + dilation for i in range(num_blocks)] # past and future context - self._context = int(np.sum(np.array(dilation)*( - np.array(kernel_size)-1)/2)) + self._context = int( + np.sum(np.array(dilation) * (np.array(kernel_size) - 1) / 2) + ) self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': - norm_groups = min(np.min(hid_units)//2, 32) + if norm_layer == "group-norm": + norm_groups = min(np.min(hid_units) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) blocks = [] for i in range(num_blocks): - if i==0: + if i == 0: blocks.append( - TDNNBlock(in_units, hid_units, - kernel_size=kernel_size[i], dilation=dilation[i], - activation=hid_act, dropout_rate=dropout_rate, - norm_layer=self._norm_layer, - use_norm=use_norm, norm_before=norm_before)) - elif i==num_blocks-1: + TDNNBlock( + in_units, + hid_units, + kernel_size=kernel_size[i], + dilation=dilation[i], + activation=hid_act, + dropout_rate=dropout_rate, + norm_layer=self._norm_layer, + use_norm=use_norm, + norm_before=norm_before, + ) + ) + elif i == num_blocks - 1: blocks.append( - TDNNBlock(hid_units, expand_units, - kernel_size=kernel_size[i], dilation=dilation[i], - activation=hid_act, dropout_rate=dropout_rate, - 
norm_layer=self._norm_layer, - use_norm=use_norm, norm_before=norm_before)) + TDNNBlock( + hid_units, + expand_units, + kernel_size=kernel_size[i], + dilation=dilation[i], + activation=hid_act, + dropout_rate=dropout_rate, + norm_layer=self._norm_layer, + use_norm=use_norm, + norm_before=norm_before, + ) + ) else: blocks.append( - ResETDNNBlock(hid_units, - kernel_size=kernel_size[i], dilation=dilation[i], - activation=hid_act, dropout_rate=dropout_rate, - norm_layer=self._norm_layer, - use_norm=use_norm, norm_before=norm_before)) + ResETDNNBlock( + hid_units, + kernel_size=kernel_size[i], + dilation=dilation[i], + activation=hid_act, + dropout_rate=dropout_rate, + norm_layer=self._norm_layer, + use_norm=use_norm, + norm_before=norm_before, + ) + ) self.blocks = nn.ModuleList(blocks) @@ -93,19 +125,17 @@ def __init__(self, num_blocks, if out_units == 0: self.out_act = None self.output = None - return + return self.with_output = True self.out_act = AF.create(out_act) self.output = Linear(expand_units, out_units) - @property def in_context(self): return (self._context, self._context) - - + def forward(self, x): for i in range(self.num_blocks): @@ -113,12 +143,12 @@ def forward(self, x): if self.with_output: if self.pooling is not None: - if self.pooling == 'mean': + if self.pooling == "mean": x = torch.mean(x, dim=2) - elif self.pooling == 'max': + elif self.pooling == "max": x = torch.max(x, dim=2) else: - raise Exception('pooling=%s not implemented' % (self.pooling)) + raise Exception("pooling=%s not implemented" % (self.pooling)) else: x = torch.transpose(x, 1, 2) @@ -128,38 +158,36 @@ def forward(self, x): return x - def get_config(self): - + out_act = AF.get_config(self.out_act) - hid_act = AF.get_config(self.blocks[0].activation) - - config = {'num_blocks': self.num_blocks, - 'in_units': self.in_units, - 'hid_units': self.hid_units, - 'expand_units': self.expand_units, - 'out_units': self.out_units, - 'kernel_size': self.kernel_size, - 'dilation': self.dilation, - 'dilation_factor': self.dilation_factor, - 'dropout_rate': self.dropout_rate, - 'norm_layer': self.norm_layer, - 'use_norm': self.use_norm, - 'norm_before': self.norm_before, - 'in_norm' : self.in_norm, - 'out_act': out_act, - 'hid_act': hid_act, - 'pooling': self.pooling } - + hid_act = AF.get_config(self.blocks[0].activation) + + config = { + "num_blocks": self.num_blocks, + "in_units": self.in_units, + "hid_units": self.hid_units, + "expand_units": self.expand_units, + "out_units": self.out_units, + "kernel_size": self.kernel_size, + "dilation": self.dilation, + "dilation_factor": self.dilation_factor, + "dropout_rate": self.dropout_rate, + "norm_layer": self.norm_layer, + "use_norm": self.use_norm, + "norm_before": self.norm_before, + "in_norm": self.in_norm, + "out_act": out_act, + "hid_act": hid_act, + "pooling": self.pooling, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - def in_shape(self): return (None, self.in_units, None) - def out_shape(self, in_shape=None): if self.with_output: return (None, self.out_units) @@ -169,5 +197,3 @@ def out_shape(self, in_shape=None): assert len(in_shape) == 3 return (in_shape[0], self.expand_units, in_shape[2]) - - diff --git a/hyperion/torch/narchs/resnet1d_decoder.py b/hyperion/torch/narchs/resnet1d_decoder.py index 4a545bce..e3290c71 100644 --- a/hyperion/torch/narchs/resnet1d_decoder.py +++ b/hyperion/torch/narchs/resnet1d_decoder.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ from 
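The optional pooling in forward collapses the time axis before the output layer; note that torch.max(x, dim=2), as written in the "max" branch, returns a (values, indices) pair, so downstream code normally wants only the values. The pooling step in isolation (a sketch using torch.amax for the max case, not the module itself):

import torch

def pool_time(x, pooling):
    # x: (batch, channels, time)
    if pooling == "mean":
        return torch.mean(x, dim=2)
    if pooling == "max":
        return torch.amax(x, dim=2)  # values only; torch.max(x, dim=2) returns (values, indices)
    raise Exception("pooling=%s not implemented" % pooling)

# pool_time(torch.randn(4, 512, 300), "mean").shape -> torch.Size([4, 512])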
jsonargparse import ArgumentParser, ActionParser -import math +import math import torch import torch.nn as nn @@ -17,41 +17,43 @@ class ResNet1dDecoder(NetArch): - - def __init__(self, in_channels=128, - in_conv_channels=128, - in_kernel_size=3, - in_stride=1, - resb_type='basic', - resb_repeats=[1, 1, 1], - resb_channels=128, - resb_kernel_sizes=3, - resb_strides=2, - resb_dilations=1, - resb_groups=1, - head_channels=0, - hid_act='relu6', - head_act=None, - dropout_rate=0, - se_r=16, - use_norm=True, - norm_layer=None, - norm_before=True): + def __init__( + self, + in_channels=128, + in_conv_channels=128, + in_kernel_size=3, + in_stride=1, + resb_type="basic", + resb_repeats=[1, 1, 1], + resb_channels=128, + resb_kernel_sizes=3, + resb_strides=2, + resb_dilations=1, + resb_groups=1, + head_channels=0, + hid_act="relu6", + head_act=None, + dropout_rate=0, + se_r=16, + use_norm=True, + norm_layer=None, + norm_before=True, + ): super().__init__() self.resb_type = resb_type - bargs = {} # block's extra arguments - if resb_type == 'basic': + bargs = {} # block's extra arguments + if resb_type == "basic": self._block = ResNet1dBasicDecBlock - elif resb_type == 'bn': + elif resb_type == "bn": self._block = ResNet1dBNDecBlock - elif resb_type == 'sebasic': + elif resb_type == "sebasic": self._block = SEResNet1dBasicDecBlock - bargs['se_r'] = se_r - elif resb_type == 'sebn': + bargs["se_r"] = se_r + elif resb_type == "sebn": self._block = SEResNet1dBNDecBlock - bargs['se_r'] = se_r + bargs["se_r"] = se_r self.in_channels = in_channels self.in_conv_channels = in_conv_channels @@ -60,13 +62,17 @@ def __init__(self, in_channels=128, num_superblocks = len(resb_repeats) self.resb_repeats = resb_repeats self.resb_channels = self._standarize_resblocks_param( - resb_channels, num_superblocks, 'resb_channels') + resb_channels, num_superblocks, "resb_channels" + ) self.resb_kernel_sizes = self._standarize_resblocks_param( - resb_kernel_sizes, num_superblocks, 'resb_kernel_sizes') + resb_kernel_sizes, num_superblocks, "resb_kernel_sizes" + ) self.resb_strides = self._standarize_resblocks_param( - resb_strides, num_superblocks, 'resb_strides') + resb_strides, num_superblocks, "resb_strides" + ) self.resb_dilations = self._standarize_resblocks_param( - resb_dilations, num_superblocks, 'resb_dilations') + resb_dilations, num_superblocks, "resb_dilations" + ) self.resb_groups = resb_groups self.head_channels = head_channels self.hid_act = hid_act @@ -78,24 +84,29 @@ def __init__(self, in_channels=128, self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': - norm_groups = min(np.min(resb_channels)//2, 32) + if norm_layer == "group-norm": + norm_groups = min(np.min(resb_channels) // 2, 32) norm_groups = max(norm_groups, resb_groups) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block self.in_block = DC1dDecBlock( - in_channels, in_conv_channels, in_kernel_size, - stride=in_stride, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) + in_channels, + in_conv_channels, + in_kernel_size, + stride=in_stride, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) self._context = self.in_block.context self._upsample_factor = self.in_block.stride cur_in_channels = in_conv_channels - - #middle blocks + + # middle blocks self.blocks = nn.ModuleList([]) for i in range(num_superblocks): repeats_i = self.resb_repeats[i] @@ -104,50 
+115,73 @@ def __init__(self, in_channels=128, kernel_size_i = self.resb_kernel_sizes[i] dilation_i = self.resb_dilations[i] block_i = self._block( - cur_in_channels, channels_i, kernel_size_i, - stride=stride_i, dilation=1, groups=self.resb_groups, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before, **bargs) - + cur_in_channels, + channels_i, + kernel_size_i, + stride=stride_i, + dilation=1, + groups=self.resb_groups, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + **bargs + ) + self.blocks.append(block_i) self._context += block_i.context * self._upsample_factor self._upsample_factor *= block_i.upsample_factor - for j in range(repeats_i-1): + for j in range(repeats_i - 1): block_i = self._block( - channels_i, channels_i, kernel_size_i, - stride=1, dilation=dilation_i, groups=self.resb_groups, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before, **bargs) - + channels_i, + channels_i, + kernel_size_i, + stride=1, + dilation=dilation_i, + groups=self.resb_groups, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + **bargs + ) + self.blocks.append(block_i) self._context += block_i.context * self._upsample_factor cur_in_channels = channels_i - #head feature block + # head feature block if self.head_channels > 0: self.head_block = DC1dDecBlock( - cur_in_channels, head_channels, kernel_size=1, - stride=1, activation=head_act, - use_norm=False, norm_before=norm_before) + cur_in_channels, + head_channels, + kernel_size=1, + stride=1, + activation=head_act, + use_norm=False, + norm_before=norm_before, + ) self._init_weights(hid_act) - - def _init_weights(self, hid_act): if isinstance(hid_act, str): act_name = hid_act if isinstance(hid_act, dict): - act_name = hid_act['name'] - if act_name in ['relu6', 'swish']: - act_name = 'relu' + act_name = hid_act["name"] + if act_name in ["relu6", "swish"]: + act_name = "relu" - init_f1 = lambda x: nn.init.kaiming_normal_(x, mode='fan_out', nonlinearity=act_name) - init_f2 = lambda x: nn.init.kaiming_normal_(x, mode='fan_out', nonlinearity='relu') + init_f1 = lambda x: nn.init.kaiming_normal_( + x, mode="fan_out", nonlinearity=act_name + ) + init_f2 = lambda x: nn.init.kaiming_normal_( + x, mode="fan_out", nonlinearity="relu" + ) for m in self.modules(): if isinstance(m, nn.Conv1d): @@ -159,7 +193,7 @@ def _init_weights(self, hid_act): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) - #re-init subpixelconvs + # re-init subpixelconvs for m in self.modules(): if isinstance(m, SubPixelConv1d): try: @@ -183,7 +217,6 @@ def _init_weights(self, hid_act): # nn.init.constant_(m.weight, 1) # nn.init.constant_(m.bias, 0) - @staticmethod def _standarize_resblocks_param(p, num_blocks, p_name): if isinstance(p, int): @@ -191,11 +224,14 @@ def _standarize_resblocks_param(p, num_blocks, p_name): elif isinstance(p, list): if len(p) == 1: p = p * num_blocks - - assert len(p) == num_blocks, ( - 'len(%s)(%d)!=%d' % (p_name, len(p), num_blocks)) + + assert len(p) == num_blocks, "len(%s)(%d)!=%d" % ( + p_name, + len(p), + num_blocks, + ) else: - raise TypeError('wrong type for param {}={}'.format(p_name, p)) + raise TypeError("wrong type for param {}={}".format(p_name, p)) return p @@ -203,23 +239,22 @@ def _compute_out_size(self, in_size): out_size = in_size * 
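The _init_weights helpers try Kaiming-normal initialization with the hidden activation's name and fall back to "relu" when torch's gain table does not know it (nn.init.calculate_gain supports only a fixed set of nonlinearities, so names like "relu6" raise ValueError). The guarded call on its own (a sketch, helper name is illustrative):

import torch.nn as nn

def kaiming_init_conv(conv, act_name):
    try:
        nn.init.kaiming_normal_(conv.weight, mode="fan_out", nonlinearity=act_name)
    except ValueError:
        # unknown nonlinearity name: fall back to the relu gain
        nn.init.kaiming_normal_(conv.weight, mode="fan_out", nonlinearity="relu")

# kaiming_init_conv(nn.Conv1d(64, 128, 3), "relu6") initializes with the relu gain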
in_stride for stride in self.conv_strides: - out_size *= stride + out_size *= stride return out_size - def in_context(self): - in_context = int(math.ceil(self._context/self._upsample_factor)) + in_context = int(math.ceil(self._context / self._upsample_factor)) return (in_context, in_context) - def in_shape(self): return (None, self.in_channels, None) - def out_shape(self, in_shape=None): - out_channels = self.head_channels if self.head_channels>0 else self.resb_channels[-1] + out_channels = ( + self.head_channels if self.head_channels > 0 else self.resb_channels[-1] + ) if in_shape is None: return (None, out_channels, None) @@ -231,20 +266,16 @@ def out_shape(self, in_shape=None): return (in_shape[0], out_chanels, T) - - def _match_shape(self, x, target_shape): t = x.size(-1) target_t = target_shape[-1] surplus = t - target_t assert surplus >= 0 if surplus > 0: - x = torch.narrow(x, -1, surplus//2, target_t).contiguous() + x = torch.narrow(x, -1, surplus // 2, target_t).contiguous() return x - - def forward(self, x, target_shape=None): x = self.in_block(x) @@ -259,159 +290,217 @@ def forward(self, x, target_shape=None): return x - - def get_config(self): - + head_act = self.head_act hid_act = self.hid_act - config = {'in_channels': self.in_channels, - 'in_conv_channels': self.in_conv_channels, - 'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'resb_type': self.resb_type, - 'resb_repeats': self.resb_repeats, - 'resb_channels': self.resb_channels, - 'resb_kernel_sizes': self.resb_kernel_sizes, - 'resb_strides': self.resb_strides, - 'resb_dilations': self.resb_dilations, - 'resb_groups': self.resb_groups, - 'head_channels': self.head_channels, - 'dropout_rate': self.dropout_rate, - 'hid_act': hid_act, - 'head_act': head_act, - 'se_r': self.se_r, - 'use_norm': self.use_norm, - 'norm_layer': self.norm_layer, - 'norm_before': self.norm_before, - } - + config = { + "in_channels": self.in_channels, + "in_conv_channels": self.in_conv_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "resb_type": self.resb_type, + "resb_repeats": self.resb_repeats, + "resb_channels": self.resb_channels, + "resb_kernel_sizes": self.resb_kernel_sizes, + "resb_strides": self.resb_strides, + "resb_dilations": self.resb_dilations, + "resb_groups": self.resb_groups, + "head_channels": self.head_channels, + "dropout_rate": self.dropout_rate, + "hid_act": hid_act, + "head_act": head_act, + "se_r": self.se_r, + "use_norm": self.use_norm, + "norm_layer": self.norm_layer, + "norm_before": self.norm_before, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - @staticmethod def filter_args(**kwargs): - if 'wo_norm' in kwargs: - kwargs['use_norm'] = not kwargs['wo_norm'] - del kwargs['wo_norm'] - - if 'norm_after' in kwargs: - kwargs['norm_before'] = not kwargs['norm_after'] - del kwargs['norm_after'] - - valid_args = ('in_channels', - 'in_conv_channels', 'in_kernel_size', 'in_stride', - 'resb_type', - 'resb_repeats', 'resb_channels', 'resb_kernel_sizes', - 'resb_strides', 'resb_dilations', 'resb_groups', - 'head_channels', 'se_r', - 'hid_act', 'head_act', - 'dropout_rate', - 'use_norm', 'norm-layer', 'norm_before') - - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + if "wo_norm" in kwargs: + kwargs["use_norm"] = not kwargs["wo_norm"] + del kwargs["wo_norm"] + + if "norm_after" in kwargs: + kwargs["norm_before"] = not kwargs["norm_after"] + del kwargs["norm_after"] + + valid_args = ( + "in_channels", 
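_match_shape crops the upsampled output back to the requested length by narrowing the time axis symmetrically; incidentally, _compute_out_size just above still references in_stride and self.conv_strides, which look like leftovers from the encoder version (the decoder defines self.in_stride and self.resb_strides). The 1d crop in isolation (a sketch, function name is illustrative):

import torch

def match_length(x, target_t):
    # x: (batch, channels, time); center-crop the time axis down to target_t
    surplus = x.size(-1) - target_t
    assert surplus >= 0
    if surplus > 0:
        x = torch.narrow(x, -1, surplus // 2, target_t).contiguous()
    return x

# match_length(torch.zeros(2, 8, 103), 100).shape -> torch.Size([2, 8, 100])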
+ "in_conv_channels", + "in_kernel_size", + "in_stride", + "resb_type", + "resb_repeats", + "resb_channels", + "resb_kernel_sizes", + "resb_strides", + "resb_dilations", + "resb_groups", + "head_channels", + "se_r", + "hid_act", + "head_act", + "dropout_rate", + "use_norm", + "norm-layer", + "norm_before", + ) + + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args - - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--in-channels', type=int, default=80, - help=('input channels of decoder')) + "--in-channels", type=int, default=80, help=("input channels of decoder") + ) parser.add_argument( - '--in-conv-channels', default=128, type=int, - help=('number of output channels in input convolution')) + "--in-conv-channels", + default=128, + type=int, + help=("number of output channels in input convolution"), + ) parser.add_argument( - '--in-kernel-size', default=3, type=int, - help=('kernel size of input convolution')) + "--in-kernel-size", + default=3, + type=int, + help=("kernel size of input convolution"), + ) - parser.add_argument('--in-stride', default=1, type=int, - help=('stride of input convolution')) + parser.add_argument( + "--in-stride", default=1, type=int, help=("stride of input convolution") + ) parser.add_argument( - '--resb-type', default='basic', - choices=['basic', 'bn'], help=('residual blocks type')) + "--resb-type", + default="basic", + choices=["basic", "bn"], + help=("residual blocks type"), + ) parser.add_argument( - '--resb-repeats', default=[1, 1, 1], type=int, - nargs='+', help=('resb-blocks repeats in each encoder stage')) + "--resb-repeats", + default=[1, 1, 1], + type=int, + nargs="+", + help=("resb-blocks repeats in each encoder stage"), + ) parser.add_argument( - '--resb-channels', default=[128, 64, 32], - type=int, nargs='+', - help=('resb-blocks channels for each stage')) + "--resb-channels", + default=[128, 64, 32], + type=int, + nargs="+", + help=("resb-blocks channels for each stage"), + ) parser.add_argument( - '--resb-kernel-sizes', default=[3], - nargs='+', type=int, help=('resb-blocks kernels for each encoder stage')) + "--resb-kernel-sizes", + default=[3], + nargs="+", + type=int, + help=("resb-blocks kernels for each encoder stage"), + ) parser.add_argument( - '--resb-strides', default=[2], - nargs='+', type=int, help=('resb-blocks strides for each encoder stage')) + "--resb-strides", + default=[2], + nargs="+", + type=int, + help=("resb-blocks strides for each encoder stage"), + ) parser.add_argument( - '--resb-dilations', default=[1], - nargs='+', type=int, help=('resb-blocks dilations for each encoder stage')) + "--resb-dilations", + default=[1], + nargs="+", + type=int, + help=("resb-blocks dilations for each encoder stage"), + ) parser.add_argument( - '--resb-groups', default=1, - type=int, help=('resb-blocks groups in convolutions')) + "--resb-groups", + default=1, + type=int, + help=("resb-blocks groups in convolutions"), + ) parser.add_argument( - '--head-channels', default=0, type=int, - help=('channels in the last conv block of encoder')) + "--head-channels", + default=0, + type=int, + help=("channels in the last conv block of encoder"), + ) try: - parser.add_argument('--hid-act', default='relu6', - help='hidden activation') + parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass - - parser.add_argument('--head-act', default=None, - 
help='activation in encoder head') - + + parser.add_argument( + "--head-act", default=None, help="activation in encoder head" + ) + try: - parser.add_argument('--dropout-rate', default=0, type=float, - help='dropout probability') + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) except: pass try: parser.add_argument( - '--norm-layer', default=None, - choices=['batch-norm', 'group-norm', 'instance-norm', 'instance-norm-affine', 'layer-norm'], - help='type of normalization layer') + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer", + ) except: pass - parser.add_argument('--wo-norm', default=False, action='store_true', - help='without batch normalization') - - parser.add_argument('--norm-after', default=False, action='store_true', - help='batch normalizaton after activation') + parser.add_argument( + "--wo-norm", + default=False, + action="store_true", + help="without batch normalization", + ) parser.add_argument( - '--se-r', default=16, type=int, - help=('squeeze-excitation compression ratio')) + "--norm-after", + default=False, + action="store_true", + help="batch normalizaton after activation", + ) - if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='ResNet1d decoder options') + parser.add_argument( + "--se-r", + default=16, + type=int, + help=("squeeze-excitation compression ratio"), + ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='ResNet1d decoder options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/resnet1d_encoder.py b/hyperion/torch/narchs/resnet1d_encoder.py index 195f3f44..78ceeac6 100644 --- a/hyperion/torch/narchs/resnet1d_encoder.py +++ b/hyperion/torch/narchs/resnet1d_encoder.py @@ -712,7 +712,10 @@ def add_class_args(parser, prefix=None, skip=set(["in_feats"])): ) parser.add_argument( - "--res2net-scale", default=1, type=int, help=("res2net scaling parameter "), + "--res2net-scale", + default=1, + type=int, + help=("res2net scaling parameter "), ) parser.add_argument( diff --git a/hyperion/torch/narchs/resnet2d_decoder.py b/hyperion/torch/narchs/resnet2d_decoder.py index 87e7f471..f5becf76 100644 --- a/hyperion/torch/narchs/resnet2d_decoder.py +++ b/hyperion/torch/narchs/resnet2d_decoder.py @@ -3,7 +3,7 @@ Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) """ -import math +import math from jsonargparse import ArgumentParser, ActionParser import torch @@ -18,41 +18,43 @@ class ResNet2dDecoder(NetArch): - - def __init__(self, in_channels=512, - in_conv_channels=512, - in_kernel_size=3, - in_stride=1, - resb_type='basic', - resb_repeats=[2, 2, 2, 2], - resb_channels=[512, 256, 128, 64], - resb_kernel_sizes=3, - resb_strides=2, - resb_dilations=1, - resb_groups=1, - head_channels=0, - hid_act='relu6', - head_act=None, - dropout_rate=0, - se_r=16, - use_norm=True, - norm_layer=None, - norm_before=True): + def __init__( + self, + in_channels=512, + in_conv_channels=512, + in_kernel_size=3, + in_stride=1, + resb_type="basic", + resb_repeats=[2, 2, 2, 2], + resb_channels=[512, 256, 128, 64], + resb_kernel_sizes=3, + resb_strides=2, + resb_dilations=1, + resb_groups=1, + head_channels=0, + hid_act="relu6", + head_act=None, + dropout_rate=0, + se_r=16, + use_norm=True, + norm_layer=None, + norm_before=True, + ): super().__init__() self.resb_type = 
resb_type bargs = {} - if resb_type == 'basic': + if resb_type == "basic": self._block = ResNet2dBasicDecBlock - elif resb_type == 'bn': + elif resb_type == "bn": self._block = ResNet2dBNDecBlock - elif resb_type == 'sebasic': + elif resb_type == "sebasic": self._block = SEResNet2dBasicDecBlock - bargs['se_r'] = se_r - elif resb_type == 'sebn': + bargs["se_r"] = se_r + elif resb_type == "sebn": self._block = SEResNet2dBNDecBlock - bargs['se_r'] = se_r + bargs["se_r"] = se_r self.in_channels = in_channels self.in_conv_channels = in_conv_channels @@ -61,13 +63,17 @@ def __init__(self, in_channels=512, num_superblocks = len(resb_repeats) self.resb_repeats = resb_repeats self.resb_channels = self._standarize_resblocks_param( - resb_channels, num_superblocks, 'resb_channels') + resb_channels, num_superblocks, "resb_channels" + ) self.resb_kernel_sizes = self._standarize_resblocks_param( - resb_kernel_sizes, num_superblocks, 'resb_kernel_sizes') + resb_kernel_sizes, num_superblocks, "resb_kernel_sizes" + ) self.resb_strides = self._standarize_resblocks_param( - resb_strides, num_superblocks, 'resb_strides') + resb_strides, num_superblocks, "resb_strides" + ) self.resb_dilations = self._standarize_resblocks_param( - resb_dilations, num_superblocks, 'resb_dilations') + resb_dilations, num_superblocks, "resb_dilations" + ) self.resb_groups = resb_groups self.head_channels = head_channels self.hid_act = hid_act @@ -79,24 +85,29 @@ def __init__(self, in_channels=512, self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': - norm_groups = min(np.min(resb_channels)//2, 32) + if norm_layer == "group-norm": + norm_groups = min(np.min(resb_channels) // 2, 32) norm_groups = max(norm_groups, resb_groups) self._norm_layer = NLF.create(norm_layer, norm_groups) # stem block self.in_block = DC2dDecBlock( - in_channels, in_conv_channels, in_kernel_size, - stride=in_stride, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before) + in_channels, + in_conv_channels, + in_kernel_size, + stride=in_stride, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + ) self._context = self.in_block.context self._upsample_factor = self.in_block.stride cur_in_channels = in_conv_channels - - #middle blocks + + # middle blocks self.blocks = nn.ModuleList([]) for i in range(num_superblocks): repeats_i = self.resb_repeats[i] @@ -105,50 +116,73 @@ def __init__(self, in_channels=512, kernel_size_i = self.resb_kernel_sizes[i] dilation_i = self.resb_dilations[i] block_i = self._block( - cur_in_channels, channels_i, kernel_size_i, - stride=stride_i, dilation=1, groups=self.resb_groups, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, norm_layer=self._norm_layer, - norm_before=norm_before, **bargs) - + cur_in_channels, + channels_i, + kernel_size_i, + stride=stride_i, + dilation=1, + groups=self.resb_groups, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + **bargs + ) + self.blocks.append(block_i) self._context += block_i.context * self._upsample_factor self._upsample_factor *= block_i.upsample_factor - for j in range(repeats_i-1): + for j in range(repeats_i - 1): block_i = self._block( - channels_i, channels_i, kernel_size_i, - stride=1, dilation=dilation_i, groups=self.resb_groups, - activation=hid_act, dropout_rate=dropout_rate, - use_norm=use_norm, 
norm_layer=self._norm_layer, - norm_before=norm_before, **bargs) - + channels_i, + channels_i, + kernel_size_i, + stride=1, + dilation=dilation_i, + groups=self.resb_groups, + activation=hid_act, + dropout_rate=dropout_rate, + use_norm=use_norm, + norm_layer=self._norm_layer, + norm_before=norm_before, + **bargs + ) + self.blocks.append(block_i) self._context += block_i.context * self._upsample_factor cur_in_channels = channels_i - #head feature block + # head feature block if self.head_channels > 0: self.head_block = DC2dDecBlock( - cur_in_channels, head_channels, kernel_size=1, - stride=1, activation=head_act, - use_norm=False, norm_before=norm_before) + cur_in_channels, + head_channels, + kernel_size=1, + stride=1, + activation=head_act, + use_norm=False, + norm_before=norm_before, + ) self._init_weights(hid_act) - - def _init_weights(self, hid_act): if isinstance(hid_act, str): act_name = hid_act if isinstance(hid_act, dict): - act_name = hid_act['name'] - if act_name in ['relu6', 'swish']: - act_name = 'relu' + act_name = hid_act["name"] + if act_name in ["relu6", "swish"]: + act_name = "relu" - init_f1 = lambda x: nn.init.kaiming_normal_(x, mode='fan_out', nonlinearity=act_name) - init_f2 = lambda x: nn.init.kaiming_normal_(x, mode='fan_out', nonlinearity='relu') + init_f1 = lambda x: nn.init.kaiming_normal_( + x, mode="fan_out", nonlinearity=act_name + ) + init_f2 = lambda x: nn.init.kaiming_normal_( + x, mode="fan_out", nonlinearity="relu" + ) for m in self.modules(): if isinstance(m, nn.Conv2d): @@ -160,7 +194,7 @@ def _init_weights(self, hid_act): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) - #re-init subpixelconvs + # re-init subpixelconvs for m in self.modules(): if isinstance(m, SubPixelConv2d): try: @@ -184,7 +218,6 @@ def _init_weights(self, hid_act): # nn.init.constant_(m.weight, 1) # nn.init.constant_(m.bias, 0) - @staticmethod def _standarize_resblocks_param(p, num_blocks, p_name): if isinstance(p, int): @@ -192,11 +225,14 @@ def _standarize_resblocks_param(p, num_blocks, p_name): elif isinstance(p, list): if len(p) == 1: p = p * num_blocks - - assert len(p) == num_blocks, ( - 'len(%s)(%d)!=%d' % (p_name, len(p), num_blocks)) + + assert len(p) == num_blocks, "len(%s)(%d)!=%d" % ( + p_name, + len(p), + num_blocks, + ) else: - raise TypeError('wrong type for param {}={}'.format(p_name, p)) + raise TypeError("wrong type for param {}={}".format(p_name, p)) return p @@ -204,23 +240,22 @@ def _compute_out_size(self, in_size): out_size = in_size * in_stride for stride in self.conv_strides: - out_size *= stride + out_size *= stride return out_size - def in_context(self): - in_context = int(math.ceil(self._context/self._upsample_factor)) + in_context = int(math.ceil(self._context / self._upsample_factor)) return (in_context, in_context) - def in_shape(self): return (None, self.in_channels, None, None) - def out_shape(self, in_shape=None): - out_channels = self.head_channels if self.head_channels>0 else self.resb_channels[-1] + out_channels = ( + self.head_channels if self.head_channels > 0 else self.resb_channels[-1] + ) if in_shape is None: return (None, out_channels, None, None) @@ -237,19 +272,17 @@ def out_shape(self, in_shape=None): return (in_shape[0], out_chanels, H, W) - def _match_shape(self, x, target_shape): x_dim = x.dim() ddim = x_dim - len(target_shape) for i in range(2, x_dim): - surplus = x.size(i) - target_shape[i-ddim] + surplus = x.size(i) - target_shape[i - ddim] assert surplus >= 0 if surplus > 0: - x = torch.narrow(x, i, surplus//2, 
target_shape[i-ddim]) + x = torch.narrow(x, i, surplus // 2, target_shape[i - ddim]) return x.contiguous() - def forward(self, x, target_shape=None): x = self.in_block(x) @@ -264,158 +297,217 @@ def forward(self, x, target_shape=None): return x - def get_config(self): - + head_act = self.head_act hid_act = self.hid_act - config = {'in_channels': self.in_channels, - 'in_conv_channels': self.in_conv_channels, - 'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'resb_type': self.resb_type, - 'resb_repeats': self.resb_repeats, - 'resb_channels': self.resb_channels, - 'resb_kernel_sizes': self.resb_kernel_sizes, - 'resb_strides': self.resb_strides, - 'resb_dilations': self.resb_dilations, - 'resb_groups': self.resb_groups, - 'head_channels': self.head_channels, - 'dropout_rate': self.dropout_rate, - 'hid_act': hid_act, - 'head_act': head_act, - 'se_r': self.se_r, - 'use_norm': self.use_norm, - 'norm_layer': self.norm_layer, - 'norm_before': self.norm_before, - } - + config = { + "in_channels": self.in_channels, + "in_conv_channels": self.in_conv_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "resb_type": self.resb_type, + "resb_repeats": self.resb_repeats, + "resb_channels": self.resb_channels, + "resb_kernel_sizes": self.resb_kernel_sizes, + "resb_strides": self.resb_strides, + "resb_dilations": self.resb_dilations, + "resb_groups": self.resb_groups, + "head_channels": self.head_channels, + "dropout_rate": self.dropout_rate, + "hid_act": hid_act, + "head_act": head_act, + "se_r": self.se_r, + "use_norm": self.use_norm, + "norm_layer": self.norm_layer, + "norm_before": self.norm_before, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - @staticmethod def filter_args(**kwargs): - if 'wo_norm' in kwargs: - kwargs['use_norm'] = not kwargs['wo_norm'] - del kwargs['wo_norm'] - - if 'norm_after' in kwargs: - kwargs['norm_before'] = not kwargs['norm_after'] - del kwargs['norm_after'] - - valid_args = ('in_channels', - 'in_conv_channels', 'in_kernel_size', 'in_stride', - 'resb_type', - 'resb_repeats', 'resb_channels', 'resb_kernel_sizes', - 'resb_strides', 'resb_dilations', 'resb_groups', - 'head_channels', 'se_r', - 'hid_act', 'had_act', - 'dropout_rate', - 'use_norm', 'norm_layer', 'norm_before') - - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + if "wo_norm" in kwargs: + kwargs["use_norm"] = not kwargs["wo_norm"] + del kwargs["wo_norm"] + + if "norm_after" in kwargs: + kwargs["norm_before"] = not kwargs["norm_after"] + del kwargs["norm_after"] + + valid_args = ( + "in_channels", + "in_conv_channels", + "in_kernel_size", + "in_stride", + "resb_type", + "resb_repeats", + "resb_channels", + "resb_kernel_sizes", + "resb_strides", + "resb_dilations", + "resb_groups", + "head_channels", + "se_r", + "hid_act", + "had_act", + "dropout_rate", + "use_norm", + "norm_layer", + "norm_before", + ) + + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args - - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--in-channels', type=int, default=80, - help=('input channels of decoder')) + "--in-channels", type=int, default=80, help=("input channels of decoder") + ) parser.add_argument( - '--in-conv-channels', default=128, type=int, - help=('number of output channels in input convolution')) + "--in-conv-channels", + 
default=128, + type=int, + help=("number of output channels in input convolution"), + ) parser.add_argument( - '--in-kernel-size', default=3, type=int, - help=('kernel size of input convolution')) + "--in-kernel-size", + default=3, + type=int, + help=("kernel size of input convolution"), + ) - parser.add_argument('--in-stride', default=1, type=int, - help=('stride of input convolution')) + parser.add_argument( + "--in-stride", default=1, type=int, help=("stride of input convolution") + ) parser.add_argument( - '--resb-type', default='basic', - choices=['basic', 'bn', 'sebasic', 'sebn'], help=('residual blocks type')) + "--resb-type", + default="basic", + choices=["basic", "bn", "sebasic", "sebn"], + help=("residual blocks type"), + ) parser.add_argument( - '--resb-repeats', default=[1, 1, 1], type=int, - nargs='+', help=('resb-blocks repeats in each encoder stage')) + "--resb-repeats", + default=[1, 1, 1], + type=int, + nargs="+", + help=("resb-blocks repeats in each encoder stage"), + ) parser.add_argument( - '--resb-channels', default=[128, 64, 32], - type=int, nargs='+', - help=('resb-blocks channels for each stage')) + "--resb-channels", + default=[128, 64, 32], + type=int, + nargs="+", + help=("resb-blocks channels for each stage"), + ) parser.add_argument( - '--resb-kernel-sizes', default=3, - nargs='+', type=int, help=('resb-blocks kernels for each encoder stage')) + "--resb-kernel-sizes", + default=3, + nargs="+", + type=int, + help=("resb-blocks kernels for each encoder stage"), + ) parser.add_argument( - '--resb-strides', default=2, - nargs='+', type=int, help=('resb-blocks strides for each encoder stage')) + "--resb-strides", + default=2, + nargs="+", + type=int, + help=("resb-blocks strides for each encoder stage"), + ) parser.add_argument( - '--resb-dilations', default=[1], - nargs='+', type=int, help=('resb-blocks dilations for each encoder stage')) + "--resb-dilations", + default=[1], + nargs="+", + type=int, + help=("resb-blocks dilations for each encoder stage"), + ) parser.add_argument( - '--resb-groups', default=1, - type=int, help=('resb-blocks groups in convolutions')) + "--resb-groups", + default=1, + type=int, + help=("resb-blocks groups in convolutions"), + ) parser.add_argument( - '--head-channels', default=0, type=int, - help=('channels in the last conv block of encoder')) + "--head-channels", + default=0, + type=int, + help=("channels in the last conv block of encoder"), + ) try: - parser.add_argument('--hid-act', default='relu6', - help='hidden activation') + parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass - - parser.add_argument('--head-act', default=None, - help='activation in encoder head') - + + parser.add_argument( + "--head-act", default=None, help="activation in encoder head" + ) + try: - parser.add_argument('--dropout-rate', default=0, type=float, - help='dropout probability') + parser.add_argument( + "--dropout-rate", default=0, type=float, help="dropout probability" + ) except: pass try: parser.add_argument( - '--norm-layer', default=None, - choices=['batch-norm', 'group-norm', 'instance-norm', 'instance-norm-affine', 'layer-norm'], - help='type of normalization layer') + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer", + ) except: pass - parser.add_argument('--wo-norm', default=False, action='store_true', - help='without batch normalization') - - parser.add_argument('--norm-after', 
default=False, action='store_true', - help='batch normalizaton after activation') + parser.add_argument( + "--wo-norm", + default=False, + action="store_true", + help="without batch normalization", + ) parser.add_argument( - '--se-r', default=16, type=int, - help=('squeeze-excitation compression ratio')) + "--norm-after", + default=False, + action="store_true", + help="batch normalizaton after activation", + ) - if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='ResNet2d decoder options') + parser.add_argument( + "--se-r", + default=16, + type=int, + help=("squeeze-excitation compression ratio"), + ) + if prefix is not None: + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='ResNet2d decoder options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/spinenet.py b/hyperion/torch/narchs/spinenet.py index 0ec33e94..4185d9c4 100644 --- a/hyperion/torch/narchs/spinenet.py +++ b/hyperion/torch/narchs/spinenet.py @@ -92,43 +92,42 @@ class SpineNet(NetArch): - def __init__(self, - in_channels, - block_specs=None, - output_levels=[3, 4, 5, 6, 7], - endpoints_num_filters=256, - resample_alpha=0.5, - feature_output_level=None, - block_repeats=1, - filter_size_scale=1.0, - conv_channels=64, - base_channels=64, - out_units=0, - concat=False, - do_endpoint_conv=True, - concat_ax=3, - upsampling_type='nearest', - hid_act={ - 'name': 'relu6', - 'inplace': True - }, - out_act=None, - in_kernel_size=7, - in_stride=2, - zero_init_residual=False, - groups=1, - dropout_rate=0, - norm_layer=None, - norm_before=True, - do_maxpool=True, - in_norm=True, - in_feats=None, - se_r=16, - time_se=False, - has_se=False, - is_res2net=False, - res2net_scale=4, - res2net_width_factor=1): + def __init__( + self, + in_channels, + block_specs=None, + output_levels=[3, 4, 5, 6, 7], + endpoints_num_filters=256, + resample_alpha=0.5, + feature_output_level=None, + block_repeats=1, + filter_size_scale=1.0, + conv_channels=64, + base_channels=64, + out_units=0, + concat=False, + do_endpoint_conv=True, + concat_ax=3, + upsampling_type="nearest", + hid_act={"name": "relu6", "inplace": True}, + out_act=None, + in_kernel_size=7, + in_stride=2, + zero_init_residual=False, + groups=1, + dropout_rate=0, + norm_layer=None, + norm_before=True, + do_maxpool=True, + in_norm=True, + in_feats=None, + se_r=16, + time_se=False, + has_se=False, + is_res2net=False, + res2net_scale=4, + res2net_width_factor=1, + ): """ Base class for the SpineNet structure. 
Based on the paper SpineNet: Learning Scale-Permuted Backbone for Recognition and Localization @@ -162,9 +161,9 @@ def __init__(self, self.concat = concat self.concat_ax = concat_ax self.do_endpoint_conv = do_endpoint_conv - self.feature_output_level = min( - output_levels - ) if feature_output_level is None else feature_output_level + self.feature_output_level = ( + min(output_levels) if feature_output_level is None else feature_output_level + ) self.res2net_scale = res2net_scale self.res2net_width_factor = res2net_width_factor @@ -174,8 +173,11 @@ def __init__(self, self.time_se = time_se self.has_se = has_se - self._block_specs = BlockSpec.build_block_specs(SPINENET_BLOCK_SPECS) \ - if block_specs is None else BlockSpec.build_block_specs(block_specs) + self._block_specs = ( + BlockSpec.build_block_specs(SPINENET_BLOCK_SPECS) + if block_specs is None + else BlockSpec.build_block_specs(block_specs) + ) self.output_levels = output_levels self.upsampling_type = upsampling_type self.dilation = 1 @@ -192,7 +194,7 @@ def __init__(self, self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': + if norm_layer == "group-norm": norm_groups = min(base_channels // 2, 32) norm_groups = max(norm_groups, groups) self._norm_layer = NLF.create(norm_layer, norm_groups) @@ -200,14 +202,16 @@ def __init__(self, if in_norm: self.in_bn = norm_layer(in_channels) - self.in_block = ResNetInputBlock(in_channels, - conv_channels, - kernel_size=in_kernel_size, - stride=in_stride, - activation=hid_act, - norm_layer=self._norm_layer, - norm_before=norm_before, - do_maxpool=do_maxpool) + self.in_block = ResNetInputBlock( + in_channels, + conv_channels, + kernel_size=in_kernel_size, + stride=in_stride, + activation=hid_act, + norm_layer=self._norm_layer, + norm_before=norm_before, + do_maxpool=do_maxpool, + ) if self.is_res2net: if self._block_specs[0].block_fn == ResNetBNBlock: @@ -217,22 +221,22 @@ def __init__(self, else: _in_block = self._block_specs[0].block_fn - self.stem0 = self._make_layer(_in_block, - 2, - self.block_repeats, - in_channels=conv_channels) + self.stem0 = self._make_layer( + _in_block, 2, self.block_repeats, in_channels=conv_channels + ) self.stem1 = self._make_layer(_in_block, 2, self.block_repeats) self.stem_nbr = 2 # the number of the stem layers - self.blocks = self._make_permuted_blocks( - self._block_specs[self.stem_nbr:]) + self.blocks = self._make_permuted_blocks(self._block_specs[self.stem_nbr :]) self.connections = self._make_permuted_connections( - self._block_specs[self.stem_nbr:]) + self._block_specs[self.stem_nbr :] + ) self.endpoints = self._make_endpoints() self._context = self._compute_max_context(self.in_block.context) - self._downsample_factor = self.in_block.downsample_factor * 2**( - self.feature_output_level - 2) + self._downsample_factor = self.in_block.downsample_factor * 2 ** ( + self.feature_output_level - 2 + ) self.with_output = False self.out_act = None if out_units > 0: @@ -244,21 +248,21 @@ def __init__(self, for m in self.modules(): if isinstance(m, nn.Conv2d): - act_name = 'relu' + act_name = "relu" if isinstance(hid_act, str): act_name = hid_act if isinstance(hid_act, dict): - act_name = hid_act['name'] - if act_name == 'swish': - act_name = 'relu' + act_name = hid_act["name"] + if act_name == "swish": + act_name = "relu" try: - nn.init.kaiming_normal_(m.weight, - mode='fan_out', - nonlinearity=act_name) + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity=act_name + ) except: - nn.init.kaiming_normal_(m.weight, - 
mode='fan_out', - nonlinearity='relu') + nn.init.kaiming_normal_( + m.weight, mode="fan_out", nonlinearity="relu" + ) elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)): nn.init.constant_(m.weight, 1) nn.init.constant_(m.bias, 0) @@ -300,26 +304,37 @@ def _make_permuted_connections(self, block_specs): connections = nn.ModuleList([]) for block in block_specs: expansion = block.block_fn.expansion - out_channels = int( - FILTER_SIZE_MAP[block.level] * self.filter_size_scale * - self.base_channels) * expansion + out_channels = ( + int( + FILTER_SIZE_MAP[block.level] + * self.filter_size_scale + * self.base_channels + ) + * expansion + ) connections_i = nn.ModuleList([]) for i in block.input_offsets: offset_block = self._block_specs[i] scale = offset_block.level - block.level - in_channels = int(FILTER_SIZE_MAP[offset_block.level] * - self.filter_size_scale * self.base_channels) + in_channels = int( + FILTER_SIZE_MAP[offset_block.level] + * self.filter_size_scale + * self.base_channels + ) connections_i.append( - SpineResample(offset_block, - in_channels, - out_channels, - scale, - self.resample_alpha, - self.upsampling_type, - activation=self.hid_act, - norm_layer=self._norm_layer, - norm_before=self.norm_before)) + SpineResample( + offset_block, + in_channels, + out_channels, + scale, + self.resample_alpha, + self.upsampling_type, + activation=self.hid_act, + norm_layer=self._norm_layer, + norm_before=self.norm_before, + ) + ) connections_i.append(AF.create(self.hid_act)) connections.append(connections_i) return connections @@ -334,10 +349,17 @@ def _make_endpoints(self): for block_spec in self._block_specs: if block_spec.is_output and block_spec.level in self.output_levels: expansion = block_spec.block_fn.expansion - in_channels = int( - FILTER_SIZE_MAP[block_spec.level] * - self.filter_size_scale * self.base_channels) * expansion - out_channels = self.endpoints_num_filters if self.do_endpoint_conv else in_channels + in_channels = ( + int( + FILTER_SIZE_MAP[block_spec.level] + * self.filter_size_scale + * self.base_channels + ) + * expansion + ) + out_channels = ( + self.endpoints_num_filters if self.do_endpoint_conv else in_channels + ) endpoints[str(block_spec.level)] = SpineEndpoints( in_channels, out_channels, @@ -347,17 +369,14 @@ def _make_endpoints(self): activation=self.hid_act, norm_layer=self._norm_layer, norm_before=self.norm_before, - do_endpoint_conv=self.do_endpoint_conv) + do_endpoint_conv=self.do_endpoint_conv, + ) return endpoints - def _make_layer(self, - block, - block_level, - num_blocks, - in_channels=None, - stride=1, - dilate=False): + def _make_layer( + self, block, block_level, num_blocks, in_channels=None, stride=1, dilate=False + ): previous_dilation = self.dilation if dilate: @@ -367,53 +386,57 @@ def _make_layer(self, kwargs = {} if self.has_se: if self.time_se: - num_feats = int(self.in_feats / - self.in_block.downsample_factor) + num_feats = int(self.in_feats / self.in_block.downsample_factor) for i in range(block_level - 2): - num_feats = int( - num_feats // - 2) if num_feats % 2 == 0 else int(num_feats // 2 + 1) - kwargs = { - 'se_r': self.se_r, - 'time_se': True, - 'num_feats': num_feats - } + num_feats = ( + int(num_feats // 2) + if num_feats % 2 == 0 + else int(num_feats // 2 + 1) + ) + kwargs = {"se_r": self.se_r, "time_se": True, "num_feats": num_feats} else: - kwargs = {'se_r': self.se_r} + kwargs = {"se_r": self.se_r} if self.is_res2net and block != ResNetBasicBlock: - kwargs['scale'] = self.res2net_scale - kwargs['width_factor'] = 
self.res2net_width_factor - channels = int(FILTER_SIZE_MAP[block_level] * self.base_channels * - self.filter_size_scale) + kwargs["scale"] = self.res2net_scale + kwargs["width_factor"] = self.res2net_width_factor + channels = int( + FILTER_SIZE_MAP[block_level] * self.base_channels * self.filter_size_scale + ) if in_channels is None: in_channels = channels * block.expansion layers = [] layers.append( - block(in_channels, - channels, - activation=self.hid_act, - stride=stride, - dropout_rate=self.dropout_rate, - groups=self.groups, - dilation=previous_dilation, - norm_layer=self._norm_layer, - norm_before=self.norm_before, - **kwargs)) + block( + in_channels, + channels, + activation=self.hid_act, + stride=stride, + dropout_rate=self.dropout_rate, + groups=self.groups, + dilation=previous_dilation, + norm_layer=self._norm_layer, + norm_before=self.norm_before, + **kwargs + ) + ) cur_in_channels = channels * block.expansion for _ in range(1, num_blocks): layers.append( - block(cur_in_channels, - channels, - activation=self.hid_act, - dropout_rate=self.dropout_rate, - groups=self.groups, - dilation=self.dilation, - norm_layer=self._norm_layer, - norm_before=self.norm_before, - **kwargs)) + block( + cur_in_channels, + channels, + activation=self.hid_act, + dropout_rate=self.dropout_rate, + groups=self.groups, + dilation=self.dilation, + norm_layer=self._norm_layer, + norm_before=self.norm_before, + **kwargs + ) + ) return nn.Sequential(*layers) @@ -429,26 +452,37 @@ def _compute_max_context(self, in_context): base_downsample_factor = self.in_block.downsample_factor context0 = in_context # context of the first two blocks (stem part) - context0 += base_downsample_factor * block_context[ - self._block_specs[0].block_fn] * self.block_repeats - context1 = context0 + base_downsample_factor * block_context[ - self._block_specs[1].block_fn] * self.block_repeats + context0 += ( + base_downsample_factor + * block_context[self._block_specs[0].block_fn] + * self.block_repeats + ) + context1 = ( + context0 + + base_downsample_factor + * block_context[self._block_specs[1].block_fn] + * self.block_repeats + ) contexts = [context0, context1] # context in the scale permuted part num_outgoing_connections = [0, 0] - for idx, block in enumerate(self._block_specs[self.stem_nbr:]): + for idx, block in enumerate(self._block_specs[self.stem_nbr :]): input0 = block.input_offsets[0] input1 = block.input_offsets[1] target_level = block.level # we add context if in the resampling connection was downsampling operation (it includes 3x3 convolution) - resample0 = self._block_specs[ - input0].level + 1 if self._block_specs[ - input0].level - target_level < 0 else 0 - resample1 = self._block_specs[ - input1].level + 1 if self._block_specs[ - input1].level - target_level < 0 else 0 + resample0 = ( + self._block_specs[input0].level + 1 + if self._block_specs[input0].level - target_level < 0 + else 0 + ) + resample1 = ( + self._block_specs[input1].level + 1 + if self._block_specs[input1].level - target_level < 0 + else 0 + ) parent0_context = contexts[input0] + resample0 parent1_context = contexts[input1] + resample1 # as input context we choose the input with higher value @@ -460,14 +494,17 @@ def _compute_max_context(self, in_context): # Some blocks have also this additional connection if block.is_output: for j, j_connections in enumerate(num_outgoing_connections): - if j_connections == 0 and self._block_specs[ - j].level == target_level: + if ( + j_connections == 0 + and self._block_specs[j].level == target_level + ): 
target_context = max(contexts[j], target_context) num_outgoing_connections[j] += 1 - downsample_factor = base_downsample_factor * 2**(target_level - 2) - target_context += block_context[ - block.block_fn] * self.block_repeats * downsample_factor + downsample_factor = base_downsample_factor * 2 ** (target_level - 2) + target_context += ( + block_context[block.block_fn] * self.block_repeats * downsample_factor + ) contexts.append(target_context) num_outgoing_connections.append(0) # logging.info('block\'s contexts: {}'.format(contexts)) @@ -490,8 +527,9 @@ def _compute_out_size(self, in_size): downsample_levels = self.feature_output_level - 2 for i in range(downsample_levels): - out_size = int(out_size // - 2) if out_size % 2 == 0 else int(out_size // 2 + 1) + out_size = ( + int(out_size // 2) if out_size % 2 == 0 else int(out_size // 2 + 1) + ) return out_size @@ -529,12 +567,12 @@ def in_shape(self): def out_shape(self, in_shape=None): """Computes the output shape given the input shape - # - # Args: - # in_shape: input shape - # Returns: - # Tuple describing output shape for the network - # """ + # + # Args: + # in_shape: input shape + # Returns: + # Tuple describing output shape for the network + #""" if self.with_output: return (None, self.out_units) @@ -615,14 +653,15 @@ def _forward(self, x): output_feats = {} num_outgoing_connections = [0, 0] - for idx, block in enumerate(self._block_specs[self.stem_nbr:]): + for idx, block in enumerate(self._block_specs[self.stem_nbr :]): input0 = block.input_offsets[0] input1 = block.input_offsets[1] parent0_feat = self.connections[idx][0](feats[input0]) parent1_feat = self.connections[idx][1](feats[input1]) parent0_feat, parent1_feat = self._match_feat_shape( - parent0_feat, parent1_feat) + parent0_feat, parent1_feat + ) target_feat = parent0_feat + parent1_feat num_outgoing_connections[input0] += 1 @@ -630,13 +669,15 @@ def _forward(self, x): # Connect intermediate blocks with outdegree 0 to the output block. 
if block.is_output: for j, (j_feat, j_connections) in enumerate( - zip(feats, num_outgoing_connections)): + zip(feats, num_outgoing_connections) + ): if j_connections == 0 and j_feat.shape == target_feat.shape: target_feat += j_feat num_outgoing_connections[j] += 1 target_feat = self.connections[idx][2]( - target_feat) # pass input through the activation function + target_feat + ) # pass input through the activation function x = self.blocks[idx](target_feat) feats.append(x) @@ -644,13 +685,14 @@ def _forward(self, x): if block.is_output and block.level in self.output_levels: if str(block.level) in output_feats: raise ValueError( - 'Duplicate feats found for output level {}.'.format( - block.level)) + "Duplicate feats found for output level {}.".format(block.level) + ) output_feats[str(block.level)] = x output_endpoints = [] - output_shape = list(output_feats[str( - self.feature_output_level)].size()) # get the target output size + output_shape = list( + output_feats[str(self.feature_output_level)].size() + ) # get the target output size for endpoint in self.endpoints: if self.endpoints[endpoint] is not None: @@ -674,7 +716,7 @@ def _forward(self, x): return x def get_config(self): - """ Gets network config + """Gets network config Returns: dictionary with config params """ @@ -683,35 +725,35 @@ def get_config(self): hid_act = self.hid_act config = { - 'in_channels': self.in_channels, - 'in_kernel_size': self.in_kernel_size, - 'in_stride': self.in_stride, - 'conv_channels': self.conv_channels, - 'base_channels': self.base_channels, - 'endpoints_num_filters': self.endpoints_num_filters, - 'resample_alpha': self.resample_alpha, - 'block_repeats': self.block_repeats, - 'filter_size_scale': self.filter_size_scale, - 'output_levels': self.output_levels, - 'feature_output_level': self.feature_output_level, - 'out_units': self.out_units, - 'concat': self.concat, - 'concat_ax': self.concat_ax, - 'do_endpoint_conv': self.do_endpoint_conv, - 'upsampling_type': self.upsampling_type, - 'zero_init_residual': self.zero_init_residual, - 'groups': self.groups, - 'dropout_rate': self.dropout_rate, - 'norm_layer': self.norm_layer, - 'norm_before': self.norm_before, - 'in_norm': self.in_norm, - 'do_maxpool': self.do_maxpool, - 'out_act': out_act, - 'hid_act': hid_act, - 'se_r': self.se_r, - 'in_feats': self.in_feats, - 'res2net_scale': self.res2net_scale, - 'res2net_width_factor': self.res2net_width_factor + "in_channels": self.in_channels, + "in_kernel_size": self.in_kernel_size, + "in_stride": self.in_stride, + "conv_channels": self.conv_channels, + "base_channels": self.base_channels, + "endpoints_num_filters": self.endpoints_num_filters, + "resample_alpha": self.resample_alpha, + "block_repeats": self.block_repeats, + "filter_size_scale": self.filter_size_scale, + "output_levels": self.output_levels, + "feature_output_level": self.feature_output_level, + "out_units": self.out_units, + "concat": self.concat, + "concat_ax": self.concat_ax, + "do_endpoint_conv": self.do_endpoint_conv, + "upsampling_type": self.upsampling_type, + "zero_init_residual": self.zero_init_residual, + "groups": self.groups, + "dropout_rate": self.dropout_rate, + "norm_layer": self.norm_layer, + "norm_before": self.norm_before, + "in_norm": self.in_norm, + "do_maxpool": self.do_maxpool, + "out_act": out_act, + "hid_act": hid_act, + "se_r": self.se_r, + "in_feats": self.in_feats, + "res2net_scale": self.res2net_scale, + "res2net_width_factor": self.res2net_width_factor, } base_config = super().get_config() @@ -721,46 +763,46 @@ def 
get_config(self): # SpineNet structures from the original paper class SpineNet49(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 256 - kwargs['filter_size_scale'] = 1.0 - kwargs['resample_alpha'] = 0.5 - kwargs['block_repeats'] = 1 + kwargs["endpoints_num_filters"] = 256 + kwargs["filter_size_scale"] = 1.0 + kwargs["resample_alpha"] = 0.5 + kwargs["block_repeats"] = 1 super(SpineNet49, self).__init__(in_channels, **kwargs) class SpineNet49S(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 128 - kwargs['filter_size_scale'] = 0.66 - kwargs['resample_alpha'] = 0.5 - kwargs['block_repeats'] = 1 + kwargs["endpoints_num_filters"] = 128 + kwargs["filter_size_scale"] = 0.66 + kwargs["resample_alpha"] = 0.5 + kwargs["block_repeats"] = 1 super(SpineNet49S, self).__init__(in_channels, **kwargs) class SpineNet96(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 256 - kwargs['filter_size_scale'] = 1.0 - kwargs['resample_alpha'] = 0.5 - kwargs['block_repeats'] = 2 + kwargs["endpoints_num_filters"] = 256 + kwargs["filter_size_scale"] = 1.0 + kwargs["resample_alpha"] = 0.5 + kwargs["block_repeats"] = 2 super(SpineNet96, self).__init__(in_channels, **kwargs) class SpineNet143(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 256 - kwargs['filter_size_scale'] = 1.0 - kwargs['resample_alpha'] = 1.0 - kwargs['block_repeats'] = 3 + kwargs["endpoints_num_filters"] = 256 + kwargs["filter_size_scale"] = 1.0 + kwargs["resample_alpha"] = 1.0 + kwargs["block_repeats"] = 3 super(SpineNet143, self).__init__(in_channels, **kwargs) class SpineNet190(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 512 - kwargs['filter_size_scale'] = 1.3 - kwargs['resample_alpha'] = 1.0 - kwargs['block_repeats'] = 4 + kwargs["endpoints_num_filters"] = 512 + kwargs["filter_size_scale"] = 1.3 + kwargs["resample_alpha"] = 1.0 + kwargs["block_repeats"] = 4 super(SpineNet190, self).__init__(in_channels, **kwargs) @@ -768,138 +810,138 @@ def __init__(self, in_channels, **kwargs): # Light SpineNets class LSpineNet49(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 64 - kwargs['conv_channels'] = 16 - kwargs['base_channels'] = 16 + kwargs["endpoints_num_filters"] = 64 + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 super(LSpineNet49, self).__init__(in_channels, **kwargs) class LSpineNet49_subpixel(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 64 - kwargs['conv_channels'] = 16 - kwargs['base_channels'] = 16 - kwargs['upsampling_type'] = 'subpixel' + kwargs["endpoints_num_filters"] = 64 + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["upsampling_type"] = "subpixel" super(LSpineNet49_subpixel, self).__init__(in_channels, **kwargs) class LSpineNet49_bilinear(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 64 - kwargs['conv_channels'] = 16 - kwargs['base_channels'] = 16 - kwargs['upsampling_type'] = 'bilinear' + kwargs["endpoints_num_filters"] = 64 + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["upsampling_type"] = "bilinear" super(LSpineNet49_bilinear, self).__init__(in_channels, **kwargs) class LSpineNet49_5(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 64 - kwargs['conv_channels'] = 16 - kwargs['base_channels'] = 16 - 
kwargs['output_levels'] = [5] - kwargs['do_endpoint_conv'] = False - kwargs['block_specs'] = SPINENET_BLOCK_SPECS_5 + kwargs["endpoints_num_filters"] = 64 + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["output_levels"] = [5] + kwargs["do_endpoint_conv"] = False + kwargs["block_specs"] = SPINENET_BLOCK_SPECS_5 super(LSpineNet49_5, self).__init__(in_channels, **kwargs) class LSpine2Net49(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 64 - kwargs['conv_channels'] = 16 - kwargs['base_channels'] = 16 - kwargs['is_res2net'] = True + kwargs["endpoints_num_filters"] = 64 + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["is_res2net"] = True super(LSpine2Net49, self).__init__(in_channels, **kwargs) # Spine2Nets ans(Time-)Squeeze-and-Excitation class SELSpine2Net49(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 64 - kwargs['conv_channels'] = 16 - kwargs['base_channels'] = 16 - kwargs['is_res2net'] = True - kwargs['has_se'] = True + kwargs["endpoints_num_filters"] = 64 + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["is_res2net"] = True + kwargs["has_se"] = True super(SELSpine2Net49, self).__init__(in_channels, **kwargs) class TSELSpine2Net49(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 64 - kwargs['conv_channels'] = 16 - kwargs['base_channels'] = 16 - kwargs['is_res2net'] = True - kwargs['has_se'] = True - kwargs['time_se'] = True + kwargs["endpoints_num_filters"] = 64 + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["is_res2net"] = True + kwargs["has_se"] = True + kwargs["time_se"] = True super(TSELSpine2Net49, self).__init__(in_channels, **kwargs) class Spine2Net49(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['is_res2net'] = True + kwargs["is_res2net"] = True super(Spine2Net49, self).__init__(in_channels, **kwargs) class SESpine2Net49(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['is_res2net'] = True - kwargs['has_se'] = True + kwargs["is_res2net"] = True + kwargs["has_se"] = True super(SESpine2Net49, self).__init__(in_channels, **kwargs) class TSESpine2Net49(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['is_res2net'] = True - kwargs['has_se'] = True - kwargs['time_se'] = True + kwargs["is_res2net"] = True + kwargs["has_se"] = True + kwargs["time_se"] = True super(TSESpine2Net49, self).__init__(in_channels, **kwargs) class Spine2Net49S(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 128 - kwargs['filter_size_scale'] = 0.66 - kwargs['is_res2net'] = True + kwargs["endpoints_num_filters"] = 128 + kwargs["filter_size_scale"] = 0.66 + kwargs["is_res2net"] = True super(Spine2Net49S, self).__init__(in_channels, **kwargs) class SESpine2Net49S(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 128 - kwargs['filter_size_scale'] = 0.66 - kwargs['is_res2net'] = True - kwargs['has_se'] = True + kwargs["endpoints_num_filters"] = 128 + kwargs["filter_size_scale"] = 0.66 + kwargs["is_res2net"] = True + kwargs["has_se"] = True super(SESpine2Net49S, self).__init__(in_channels, **kwargs) class TSESpine2Net49S(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 128 - kwargs['filter_size_scale'] = 0.66 - kwargs['is_res2net'] = True - kwargs['has_se'] = True - kwargs['time_se'] = True + kwargs["endpoints_num_filters"] = 128 + 
kwargs["filter_size_scale"] = 0.66 + kwargs["is_res2net"] = True + kwargs["has_se"] = True + kwargs["time_se"] = True super(TSESpine2Net49S, self).__init__(in_channels, **kwargs) # R0-SP53 (structure from the paper) class LR0_SP53(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['endpoints_num_filters'] = 64 - kwargs['conv_channels'] = 16 - kwargs['base_channels'] = 16 - kwargs['block_specs'] = R0_SP53_BLOCK_SPECS + kwargs["endpoints_num_filters"] = 64 + kwargs["conv_channels"] = 16 + kwargs["base_channels"] = 16 + kwargs["block_specs"] = R0_SP53_BLOCK_SPECS super(LR0_SP53, self).__init__(in_channels, **kwargs) class R0_SP53(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['block_specs'] = R0_SP53_BLOCK_SPECS + kwargs["block_specs"] = R0_SP53_BLOCK_SPECS super(R0_SP53, self).__init__(in_channels, **kwargs) # concatenation class SpineNet49_concat_time(SpineNet): def __init__(self, in_channels, **kwargs): - kwargs['concat'] = True + kwargs["concat"] = True super(SpineNet49_concat_time, self).__init__(in_channels, **kwargs) diff --git a/hyperion/torch/narchs/spinenet_factory.py b/hyperion/torch/narchs/spinenet_factory.py index 5a82d374..02e36244 100644 --- a/hyperion/torch/narchs/spinenet_factory.py +++ b/hyperion/torch/narchs/spinenet_factory.py @@ -7,111 +7,130 @@ from .spinenet import * spinenet_dict = { - 'spinenet49': SpineNet49, - 'spinenet49s': SpineNet49S, - 'spinenet96': SpineNet96, - 'spinenet143': SpineNet143, - 'spinenet190': SpineNet190, - 'lspinenet49': LSpineNet49, - 'lspinenet49_subpixel': LSpineNet49_subpixel, - 'lspinenet49_bilinear': LSpineNet49_bilinear, - 'lspinenet49_5': LSpineNet49_5, - 'lspine2net49': LSpine2Net49, - 'selspine2net49': SELSpine2Net49, - 'tselspine2net49': TSELSpine2Net49, - 'spine2net49': Spine2Net49, - 'sespine2net49': SESpine2Net49, - 'tsespine2net49': TSESpine2Net49, - 'spine2net49s': Spine2Net49S, - 'sespine2net49s': SESpine2Net49S, - 'tsespine2net49s': TSESpine2Net49S, - 'lr0_sp53': LR0_SP53, - 'r0_sp53': R0_SP53, - 'spinenet49_concat_time': SpineNet49_concat_time, + "spinenet49": SpineNet49, + "spinenet49s": SpineNet49S, + "spinenet96": SpineNet96, + "spinenet143": SpineNet143, + "spinenet190": SpineNet190, + "lspinenet49": LSpineNet49, + "lspinenet49_subpixel": LSpineNet49_subpixel, + "lspinenet49_bilinear": LSpineNet49_bilinear, + "lspinenet49_5": LSpineNet49_5, + "lspine2net49": LSpine2Net49, + "selspine2net49": SELSpine2Net49, + "tselspine2net49": TSELSpine2Net49, + "spine2net49": Spine2Net49, + "sespine2net49": SESpine2Net49, + "tsespine2net49": TSESpine2Net49, + "spine2net49s": Spine2Net49S, + "sespine2net49s": SESpine2Net49S, + "tsespine2net49s": TSESpine2Net49S, + "lr0_sp53": LR0_SP53, + "r0_sp53": R0_SP53, + "spinenet49_concat_time": SpineNet49_concat_time, } class SpineNetFactory(object): @staticmethod - def create(spinenet_type, - in_channels, - output_levels=[3, 4, 5, 6, 7], - endpoints_num_filters=256, - resample_alpha=0.5, - block_repeats=1, - filter_size_scale=1.0, - conv_channels=64, - base_channels=64, - out_units=0, - hid_act={ - 'name': 'relu6', - 'inplace': True - }, - out_act=None, - in_kernel_size=7, - in_stride=2, - zero_init_residual=False, - groups=1, - dropout_rate=0, - norm_layer=None, - norm_before=True, - do_maxpool=True, - in_norm=True, - se_r=16, - in_feats=None, - res2net_scale=4, - res2net_width_factor=1): + def create( + spinenet_type, + in_channels, + output_levels=[3, 4, 5, 6, 7], + endpoints_num_filters=256, + resample_alpha=0.5, + block_repeats=1, + filter_size_scale=1.0, + 
conv_channels=64, + base_channels=64, + out_units=0, + hid_act={"name": "relu6", "inplace": True}, + out_act=None, + in_kernel_size=7, + in_stride=2, + zero_init_residual=False, + groups=1, + dropout_rate=0, + norm_layer=None, + norm_before=True, + do_maxpool=True, + in_norm=True, + se_r=16, + in_feats=None, + res2net_scale=4, + res2net_width_factor=1, + ): try: spinenet_class = spinenet_dict[spinenet_type] except: - raise Exception('%s is not valid SpineNet network' % - (spinenet_type)) - - spinenet = spinenet_class(in_channels, - output_levels=output_levels, - endpoints_num_filters=endpoints_num_filters, - resample_alpha=resample_alpha, - block_repeats=block_repeats, - filter_size_scale=filter_size_scale, - conv_channels=conv_channels, - base_channels=base_channels, - out_units=out_units, - hid_act=hid_act, - out_act=out_act, - in_kernel_size=in_kernel_size, - in_stride=in_stride, - zero_init_residual=zero_init_residual, - groups=groups, - dropout_rate=dropout_rate, - norm_layer=norm_layer, - norm_before=norm_before, - do_maxpool=do_maxpool, - in_norm=in_norm, - se_r=se_r, - in_feats=in_feats, - res2net_scale=res2net_scale, - res2net_width_factor=res2net_width_factor) + raise Exception("%s is not valid SpineNet network" % (spinenet_type)) + + spinenet = spinenet_class( + in_channels, + output_levels=output_levels, + endpoints_num_filters=endpoints_num_filters, + resample_alpha=resample_alpha, + block_repeats=block_repeats, + filter_size_scale=filter_size_scale, + conv_channels=conv_channels, + base_channels=base_channels, + out_units=out_units, + hid_act=hid_act, + out_act=out_act, + in_kernel_size=in_kernel_size, + in_stride=in_stride, + zero_init_residual=zero_init_residual, + groups=groups, + dropout_rate=dropout_rate, + norm_layer=norm_layer, + norm_before=norm_before, + do_maxpool=do_maxpool, + in_norm=in_norm, + se_r=se_r, + in_feats=in_feats, + res2net_scale=res2net_scale, + res2net_width_factor=res2net_width_factor, + ) return spinenet def filter_args(**kwargs): - if 'norm_after' in kwargs: - kwargs['norm_before'] = not kwargs['norm_after'] - del kwargs['norm_after'] - - if 'no_maxpool' in kwargs: - kwargs['do_maxpool'] = not kwargs['no_maxpool'] - del kwargs['no_maxpool'] - - valid_args = ('spinenet_type', 'in_channels', 'ouput_levels', - 'endpoints_num_filters', 'resample_alpha', - 'block_repeats', 'filter_size_scale', 'conv_channels', - 'base_channels', 'out_units', 'hid_act', 'out_act', - 'in_kernel_size', 'in_stride', 'zero_init_residual', - 'groups', 'dropout_rate', 'in_norm', 'norm_layer', - 'norm_before', 'do_maxpool', 'se_r', 'res2net_scale', - 'res2net_width_factor', 'in_feats') + if "norm_after" in kwargs: + kwargs["norm_before"] = not kwargs["norm_after"] + del kwargs["norm_after"] + + if "no_maxpool" in kwargs: + kwargs["do_maxpool"] = not kwargs["no_maxpool"] + del kwargs["no_maxpool"] + + valid_args = ( + "spinenet_type", + "in_channels", + "ouput_levels", + "endpoints_num_filters", + "resample_alpha", + "block_repeats", + "filter_size_scale", + "conv_channels", + "base_channels", + "out_units", + "hid_act", + "out_act", + "in_kernel_size", + "in_stride", + "zero_init_residual", + "groups", + "dropout_rate", + "in_norm", + "norm_layer", + "norm_before", + "do_maxpool", + "se_r", + "res2net_scale", + "res2net_width_factor", + "in_feats", + ) args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args @@ -120,119 +139,130 @@ def filter_args(**kwargs): def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = 
ArgumentParser(prog='') + parser = ArgumentParser(prog="") spinenet_types = spinenet_dict.keys() - parser.add_argument('--spinenet-type', - type=str.lower, - default='spinenet49', - choices=spinenet_types, - help=('SpineNet type')) + parser.add_argument( + "--spinenet-type", + type=str.lower, + default="spinenet49", + choices=spinenet_types, + help=("SpineNet type"), + ) - parser.add_argument('--in-channels', - default=1, - type=int, - help=('number of input channels')) + parser.add_argument( + "--in-channels", default=1, type=int, help=("number of input channels") + ) parser.add_argument( - '--conv-channels', + "--conv-channels", default=64, type=int, - help=('number of output channels in input convolution ')) + help=("number of output channels in input convolution "), + ) - parser.add_argument('--base-channels', - default=64, - type=int, - help=('base channels of first SpineNet block')) + parser.add_argument( + "--base-channels", + default=64, + type=int, + help=("base channels of first SpineNet block"), + ) - parser.add_argument('--in-kernel-size', - default=7, - type=int, - help=('kernel size of first convolution')) + parser.add_argument( + "--in-kernel-size", + default=7, + type=int, + help=("kernel size of first convolution"), + ) - parser.add_argument('--in-stride', - default=2, - type=int, - help=('stride of first convolution')) + parser.add_argument( + "--in-stride", default=2, type=int, help=("stride of first convolution") + ) parser.add_argument( - '--groups', + "--groups", default=1, type=int, - help=('number of groups in residual blocks convolutions')) + help=("number of groups in residual blocks convolutions"), + ) try: - parser.add_argument('--norm-layer', - default=None, - choices=[ - 'batch-norm', 'group-norm', - 'instance-norm', 'instance-norm-affine', - 'layer-norm' - ], - help='type of normalization layer') + parser.add_argument( + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer", + ) except: pass - parser.add_argument('--in-norm', - default=False, - action='store_true', - help='batch normalization at the input') + parser.add_argument( + "--in-norm", + default=False, + action="store_true", + help="batch normalization at the input", + ) parser.add_argument( - '--no-maxpool', + "--no-maxpool", default=False, - action='store_true', - help='don\'t do max pooling after first convolution') + action="store_true", + help="don't do max pooling after first convolution", + ) parser.add_argument( - '--zero-init-residual', + "--zero-init-residual", default=False, - action='store_true', - help='Zero-initialize the last BN in each residual branch') + action="store_true", + help="Zero-initialize the last BN in each residual branch", + ) parser.add_argument( - '--se-r', + "--se-r", default=16, type=int, - help=('squeeze ratio in squeeze-excitation blocks')) + help=("squeeze ratio in squeeze-excitation blocks"), + ) - parser.add_argument('--res2net-scale', - default=4, - type=int, - help=('scale parameter for res2net')) + parser.add_argument( + "--res2net-scale", default=4, type=int, help=("scale parameter for res2net") + ) parser.add_argument( - '--res2net-width-factor', + "--res2net-width-factor", default=1, type=float, - help=('multiplicative factor for the internal width of res2net')) + help=("multiplicative factor for the internal width of res2net"), + ) try: - parser.add_argument('--hid-act', - default='relu6', - help='hidden activation') + 
parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass try: - parser.add_argument('--norm-after', - default=False, - action='store_true', - help='batch normalizaton after activation') + parser.add_argument( + "--norm-after", + default=False, + action="store_true", + help="batch normalizaton after activation", + ) except: pass try: - parser.add_argument('--dropout-rate', - default=0, - type=float, - help='dropout') + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") except: pass if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/tdnn.py b/hyperion/torch/narchs/tdnn.py index a765fbdb..6cdcbf85 100644 --- a/hyperion/torch/narchs/tdnn.py +++ b/hyperion/torch/narchs/tdnn.py @@ -16,14 +16,24 @@ class TDNNV1(NetArch): - - def __init__(self, num_blocks, - in_units, hid_units, out_units=0, - kernel_size=3, dilation=1, dilation_factor=1, - hid_act={'name':'relu', 'inplace':True}, out_act=None, - dropout_rate=0, - norm_layer=None, use_norm=True, norm_before=True, in_norm=True, - pooling=None): + def __init__( + self, + num_blocks, + in_units, + hid_units, + out_units=0, + kernel_size=3, + dilation=1, + dilation_factor=1, + hid_act={"name": "relu", "inplace": True}, + out_act=None, + dropout_rate=0, + norm_layer=None, + use_norm=True, + norm_before=True, + in_norm=True, + pooling=None, + ): super().__init__() @@ -55,26 +65,34 @@ def __init__(self, num_blocks, if isinstance(dilation, list): assert num_blocks == len(dilation) else: - dilation = [dilation_factor*i+dilation for i in range(num_blocks)] + dilation = [dilation_factor * i + dilation for i in range(num_blocks)] # past and future context - self._context = int(np.sum(np.array(dilation)*( - np.array(kernel_size)-1)/2)) + self._context = int( + np.sum(np.array(dilation) * (np.array(kernel_size) - 1) / 2) + ) self.norm_layer = norm_layer norm_groups = None - if norm_layer == 'group-norm': - norm_groups = min(np.min(hid_units)//2, 32) + if norm_layer == "group-norm": + norm_groups = min(np.min(hid_units) // 2, 32) self._norm_layer = NLF.create(norm_layer, norm_groups) blocks = [] for i in range(num_blocks): blocks.append( - TDNNBlock(units[i], units[i+1], - kernel_size=kernel_size[i], dilation=dilation[i], - activation=hid_act, dropout_rate=dropout_rate, - norm_layer=self._norm_layer, - use_norm=use_norm, norm_before=norm_before)) + TDNNBlock( + units[i], + units[i + 1], + kernel_size=kernel_size[i], + dilation=dilation[i], + activation=hid_act, + dropout_rate=dropout_rate, + norm_layer=self._norm_layer, + use_norm=use_norm, + norm_before=norm_before, + ) + ) self.blocks = nn.ModuleList(blocks) @@ -82,19 +100,17 @@ def __init__(self, num_blocks, if out_units == 0: self.out_act = None self.output = None - return + return self.with_output = True self.out_act = AF.create(out_act) self.output = Linear(units[-1], out_units) - @property def in_context(self): return (self._context, self._context) - - + def forward(self, x, use_amp=False): if use_amp: with torch.cuda.amp.autocast(): @@ -102,7 +118,6 @@ def forward(self, x, use_amp=False): return self._forward(x) - def _forward(self, x): for i in range(self.num_blocks): @@ -110,12 +125,12 @@ def _forward(self, x): if self.with_output: if self.pooling is not None: - if self.pooling == 'mean': + if self.pooling == "mean": x = torch.mean(x, dim=2) - 
elif self.pooling == 'max': + elif self.pooling == "max": x = torch.max(x, dim=2) else: - raise Exception('pooling=%s not implemented' % (self.pooling)) + raise Exception("pooling=%s not implemented" % (self.pooling)) else: x = torch.transpose(x, 1, 2) @@ -125,37 +140,35 @@ def _forward(self, x): return x - def get_config(self): - + out_act = AF.get_config(self.out_act) - hid_act = AF.get_config(self.blocks[0].activation) - - config = {'num_blocks': self.num_blocks, - 'in_units': self.in_units, - 'hid_units': self.hid_units, - 'out_units': self.out_units, - 'kernel_size': self.kernel_size, - 'dilation': self.dilation, - 'dilation_factor': self.dilation_factor, - 'dropout_rate': self.dropout_rate, - 'norm_layer': self.norm_layer, - 'use_norm': self.use_norm, - 'norm_before': self.norm_before, - 'in_norm' : self.in_norm, - 'out_act': out_act, - 'hid_act': hid_act, - 'pooling': self.pooling } - + hid_act = AF.get_config(self.blocks[0].activation) + + config = { + "num_blocks": self.num_blocks, + "in_units": self.in_units, + "hid_units": self.hid_units, + "out_units": self.out_units, + "kernel_size": self.kernel_size, + "dilation": self.dilation, + "dilation_factor": self.dilation_factor, + "dropout_rate": self.dropout_rate, + "norm_layer": self.norm_layer, + "use_norm": self.use_norm, + "norm_before": self.norm_before, + "in_norm": self.in_norm, + "out_act": out_act, + "hid_act": hid_act, + "pooling": self.pooling, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - def in_shape(self): return (None, self.in_units, None) - def out_shape(self, in_shape=None): if self.with_output: return (None, self.out_units) @@ -170,6 +183,3 @@ def out_shape(self, in_shape=None): assert len(in_shape) == 3 return (in_shape[0], out_units, in_shape[2]) - - - diff --git a/hyperion/torch/narchs/tdnn_factory.py b/hyperion/torch/narchs/tdnn_factory.py index faa7e15b..584e9243 100644 --- a/hyperion/torch/narchs/tdnn_factory.py +++ b/hyperion/torch/narchs/tdnn_factory.py @@ -9,165 +9,258 @@ from .etdnn import ETDNNV1 from .resetdnn import ResETDNNV1 -class TDNNFactory(object): +class TDNNFactory(object): @staticmethod - def create(tdnn_type, num_enc_blocks, - in_feats, enc_hid_units, enc_expand_units=None, - kernel_size=3, dilation=1, dilation_factor=1, - hid_act={'name':'relu6', 'inplace':True}, - out_units=0, out_act=None, - dropout_rate=0, - norm_layer=None, use_norm=True, norm_before=True, in_norm=True): + def create( + tdnn_type, + num_enc_blocks, + in_feats, + enc_hid_units, + enc_expand_units=None, + kernel_size=3, + dilation=1, + dilation_factor=1, + hid_act={"name": "relu6", "inplace": True}, + out_units=0, + out_act=None, + dropout_rate=0, + norm_layer=None, + use_norm=True, + norm_before=True, + in_norm=True, + ): if enc_expand_units is not None and isinstance(enc_hid_units, int): - if tdnn_type != 'resetdnn' : - enc_hid_units = (num_enc_blocks - 1)*[enc_hid_units] + [enc_expand_units] + if tdnn_type != "resetdnn": + enc_hid_units = (num_enc_blocks - 1) * [enc_hid_units] + [ + enc_expand_units + ] - if tdnn_type == 'tdnn': + if tdnn_type == "tdnn": nnet = TDNNV1( - num_enc_blocks, in_feats, enc_hid_units, out_units=out_units, - kernel_size=kernel_size, - dilation=dilation, dilation_factor=dilation_factor, - hid_act=hid_act, out_act=out_act, dropout_rate=dropout_rate, - norm_layer=norm_layer, use_norm=use_norm, - norm_before=norm_before, in_norm=in_norm) - elif tdnn_type == 'etdnn': + num_enc_blocks, + in_feats, + enc_hid_units, + out_units=out_units, + 
kernel_size=kernel_size, + dilation=dilation, + dilation_factor=dilation_factor, + hid_act=hid_act, + out_act=out_act, + dropout_rate=dropout_rate, + norm_layer=norm_layer, + use_norm=use_norm, + norm_before=norm_before, + in_norm=in_norm, + ) + elif tdnn_type == "etdnn": nnet = ETDNNV1( - num_enc_blocks, in_feats, enc_hid_units, out_units=out_units, - kernel_size=kernel_size, - dilation=dilation, dilation_factor=dilation_factor, - hid_act=hid_act,out_act=out_act, dropout_rate=dropout_rate, - norm_layer=norm_layer, use_norm=use_norm, - norm_before=norm_before, in_norm=in_norm) - elif tdnn_type == 'resetdnn': + num_enc_blocks, + in_feats, + enc_hid_units, + out_units=out_units, + kernel_size=kernel_size, + dilation=dilation, + dilation_factor=dilation_factor, + hid_act=hid_act, + out_act=out_act, + dropout_rate=dropout_rate, + norm_layer=norm_layer, + use_norm=use_norm, + norm_before=norm_before, + in_norm=in_norm, + ) + elif tdnn_type == "resetdnn": if enc_expand_units is None: enc_expand_units = enc_hid_units nnet = ResETDNNV1( - num_enc_blocks, in_feats, enc_hid_units, enc_expand_units, - out_units=out_units, kernel_size=kernel_size, - dilation=dilation, dilation_factor=dilation_factor, - hid_act=hid_act, out_act=out_act, dropout_rate=dropout_rate, - norm_layer=norm_layer, use_norm=use_norm, - norm_before=norm_before, in_norm=in_norm) + num_enc_blocks, + in_feats, + enc_hid_units, + enc_expand_units, + out_units=out_units, + kernel_size=kernel_size, + dilation=dilation, + dilation_factor=dilation_factor, + hid_act=hid_act, + out_act=out_act, + dropout_rate=dropout_rate, + norm_layer=norm_layer, + use_norm=use_norm, + norm_before=norm_before, + in_norm=in_norm, + ) else: - raise Exception('%s is not valid TDNN network' % (tdnn_type)) + raise Exception("%s is not valid TDNN network" % (tdnn_type)) return nnet - - def filter_args(**kwargs): - if 'wo_norm' in kwargs: - kwargs['use_norm'] = not kwargs['wo_norm'] - del kwargs['wo_norm'] - - if 'norm_after' in kwargs: - kwargs['norm_before'] = not kwargs['norm_after'] - del kwargs['norm_after'] - - valid_args = ('tdnn_type', 'num_enc_blocks', - 'enc_hid_units', 'enc_expand_units', 'kernel_size', - 'dilation', 'dilation_factor', 'in_norm', 'hid_act', - 'norm_layer', 'use_norm', 'norm_before', 'in_feats', 'dropout_rate') - - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - - for arg in ('enc_hid_units', 'kernel_size', 'dilation'): + if "wo_norm" in kwargs: + kwargs["use_norm"] = not kwargs["wo_norm"] + del kwargs["wo_norm"] + + if "norm_after" in kwargs: + kwargs["norm_before"] = not kwargs["norm_after"] + del kwargs["norm_after"] + + valid_args = ( + "tdnn_type", + "num_enc_blocks", + "enc_hid_units", + "enc_expand_units", + "kernel_size", + "dilation", + "dilation_factor", + "in_norm", + "hid_act", + "norm_layer", + "use_norm", + "norm_before", + "in_feats", + "dropout_rate", + ) + + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) + + for arg in ("enc_hid_units", "kernel_size", "dilation"): if arg in args: - val = args[arg] - if isinstance(val, list) and len(val)==1: + val = args[arg] + if isinstance(val, list) and len(val) == 1: args[arg] = val[0] - - return args - + return args @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') - - parser.add_argument('--tdnn-type', type=str.lower, - default='resetdnn', - choices=['tdnn','etdnn', 'resetdnn'], - help=('TDNN type: TDNN, ETDNN, ResETDNN')) - - 
parser.add_argument('--num-enc-blocks', - default=9, type=int, - help=('number of encoder layer blocks')) - - parser.add_argument('--enc-hid-units', nargs='+', - default=512, type=int, - help=('number of encoder layer blocks')) - - parser.add_argument('--enc-expand-units', - default=None, type=int, - help=('dimension of last layer of ResETDNN')) - - parser.add_argument('--kernel-size', nargs='+', - default=3, type=int, - help=('kernel sizes of encoder conv1d')) - - parser.add_argument('--dilation', nargs='+', - default=1, type=int, - help=('dilations of encoder conv1d')) - - parser.add_argument('--dilation-factor', - default=1, type=int, - help=('dilation increment wrt previous conv1d layer')) + parser = ArgumentParser(prog="") + + parser.add_argument( + "--tdnn-type", + type=str.lower, + default="resetdnn", + choices=["tdnn", "etdnn", "resetdnn"], + help=("TDNN type: TDNN, ETDNN, ResETDNN"), + ) + + parser.add_argument( + "--num-enc-blocks", + default=9, + type=int, + help=("number of encoder layer blocks"), + ) + + parser.add_argument( + "--enc-hid-units", + nargs="+", + default=512, + type=int, + help=("number of encoder layer blocks"), + ) + + parser.add_argument( + "--enc-expand-units", + default=None, + type=int, + help=("dimension of last layer of ResETDNN"), + ) + + parser.add_argument( + "--kernel-size", + nargs="+", + default=3, + type=int, + help=("kernel sizes of encoder conv1d"), + ) + + parser.add_argument( + "--dilation", + nargs="+", + default=1, + type=int, + help=("dilations of encoder conv1d"), + ) + + parser.add_argument( + "--dilation-factor", + default=1, + type=int, + help=("dilation increment wrt previous conv1d layer"), + ) try: - parser.add_argument('--hid-act', default='relu6', - help='hidden activation') + parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass try: parser.add_argument( - '--norm-layer', default=None, - choices=['batch-norm', 'group-norm', 'instance-norm', 'instance-norm-affine', 'layer-norm'], - help='type of normalization layer') + "--norm-layer", + default=None, + choices=[ + "batch-norm", + "group-norm", + "instance-norm", + "instance-norm-affine", + "layer-norm", + ], + help="type of normalization layer", + ) except: pass - parser.add_argument('--in-norm', default=False, action='store_true', - help='batch normalization at the input') + parser.add_argument( + "--in-norm", + default=False, + action="store_true", + help="batch normalization at the input", + ) try: - parser.add_argument('--wo-norm', default=False, action='store_true', - help='without batch normalization') + parser.add_argument( + "--wo-norm", + default=False, + action="store_true", + help="without batch normalization", + ) except: pass - + try: - parser.add_argument('--norm-after', default=False, action='store_true', - help='batch normalizaton after activation') + parser.add_argument( + "--norm-after", + default=False, + action="store_true", + help="batch normalizaton after activation", + ) except: pass - + try: - parser.add_argument('--dropout-rate', default=0, type=float, - help='dropout') + parser.add_argument("--dropout-rate", default=0, type=float, help="dropout") except: pass try: - parser.add_argument('--in-feats', default=None, type=int, - help=('input feature dimension, ' - 'if None it will try to infer from encoder network')) + parser.add_argument( + "--in-feats", + default=None, + type=int, + help=( + "input feature dimension, " + "if None it will try to infer from encoder network" + ), + ) except: pass if prefix is not None: - 
outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='TDNN options') - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='TDNN options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/torch_na_loader.py b/hyperion/torch/narchs/torch_na_loader.py index ec042646..542742fa 100644 --- a/hyperion/torch/narchs/torch_na_loader.py +++ b/hyperion/torch/narchs/torch_na_loader.py @@ -34,36 +34,36 @@ class TorchNALoader(object): - @staticmethod def load(file_path, extra_objs={}): model_data = torch.load(model_path) - cfg = model_data['model_cfg'] - class_name = cfg['class_name'] - del cfg['class_name'] + cfg = model_data["model_cfg"] + class_name = cfg["class_name"] + del cfg["class_name"] if class_name in globals(): class_obj = globals()[class_name] elif class_name in extra_objs: class_obj = extra_objs[class_name] else: - raise Exception('unknown neural architecture object with class_name=%s' - % (class_name)) + raise Exception( + "unknown neural architecture object with class_name=%s" % (class_name) + ) - state_dict = model_data['model_state_dict'] + state_dict = model_data["model_state_dict"] return class_obj.load(cfg=cfg, state_dict=state_dict) - @staticmethod def load_from_cfg(cfg, state_dict=None, extra_objs={}): - class_name = cfg['class_name'] - del cfg['class_name'] + class_name = cfg["class_name"] + del cfg["class_name"] if class_name in globals(): class_obj = globals()[class_name] elif class_name in extra_objs: class_obj = extra_objs[class_name] else: - raise Exception('unknown neural architecture object with class_name=%s' - % (class_name)) + raise Exception( + "unknown neural architecture object with class_name=%s" % (class_name) + ) return class_obj.load(cfg=cfg, state_dict=state_dict) diff --git a/hyperion/torch/narchs/transformer_encoder_v1.py b/hyperion/torch/narchs/transformer_encoder_v1.py index 51855b49..8d479f24 100644 --- a/hyperion/torch/narchs/transformer_encoder_v1.py +++ b/hyperion/torch/narchs/transformer_encoder_v1.py @@ -14,6 +14,7 @@ from ..layer_blocks import TransformerConv2dSubsampler as Conv2dSubsampler from .net_arch import NetArch + class TransformerEncoderV1(NetArch): """Transformer encoder module. 
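The torch_na_loader.py hunks above implement a registry-free dispatch: a saved config dict carries a "class_name" key, which is looked up in the module's globals() (or in a caller-supplied extra_objs dict), and the resolved class rebuilds itself via class_obj.load(cfg=..., state_dict=...). The sketch below is a minimal, illustrative usage of that dispatch, not part of the diff; the import path mirrors the file path shown above, while the concrete architecture name and config values are assumptions for demonstration, and it presumes the architecture class exposes the load classmethod that TorchNALoader calls.

from hyperion.torch.narchs.torch_na_loader import TorchNALoader

# Hypothetical config: key names follow TDNNV1.get_config() from the tdnn.py
# hunks earlier in this diff; whether "TDNNV1" resolves here depends on which
# architectures torch_na_loader.py imports into its namespace.
cfg = {
    "class_name": "TDNNV1",  # popped by load_from_cfg and used for the class lookup
    "num_blocks": 9,
    "in_units": 80,
    "hid_units": 512,
}

# With state_dict=None the architecture is rebuilt from the config alone
# (randomly initialized); passing a real state_dict restores trained weights.
net = TorchNALoader.load_from_cfg(cfg)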
@@ -46,25 +47,30 @@ class TransformerEncoderV1(NetArch): """ - def __init__(self, in_feats, - d_model=256, - num_heads=4, - num_blocks=6, - att_type = 'scaled-dot-prod-v1', - att_context = 25, - ff_type='linear', - d_ff=2048, - ff_kernel_size=1, - ff_dropout_rate=0.1, - pos_dropout_rate=0.1, - att_dropout_rate=0.0, - in_layer_type='conv2d-sub', - rel_pos_enc=False, - causal_pos_enc=False, - hid_act='relu6', - norm_before=True, - concat_after=False, - padding_idx=-1, in_time_dim=-1, out_time_dim=1): + def __init__( + self, + in_feats, + d_model=256, + num_heads=4, + num_blocks=6, + att_type="scaled-dot-prod-v1", + att_context=25, + ff_type="linear", + d_ff=2048, + ff_kernel_size=1, + ff_dropout_rate=0.1, + pos_dropout_rate=0.1, + att_dropout_rate=0.0, + in_layer_type="conv2d-sub", + rel_pos_enc=False, + causal_pos_enc=False, + hid_act="relu6", + norm_before=True, + concat_after=False, + padding_idx=-1, + in_time_dim=-1, + out_time_dim=1, + ): super().__init__() self.in_feats = in_feats @@ -95,22 +101,32 @@ def __init__(self, in_feats, blocks = [] for i in range(num_blocks): - blocks.append(EBlock( - d_model, att_type, num_heads, - ff_type, d_ff, ff_kernel_size, - ff_act=hid_act, ff_dropout_rate=ff_dropout_rate, - att_context=att_context, att_dropout_rate=att_dropout_rate, - rel_pos_enc=rel_pos_enc, causal_pos_enc=causal_pos_enc, - norm_before=norm_before, concat_after=concat_after)) + blocks.append( + EBlock( + d_model, + att_type, + num_heads, + ff_type, + d_ff, + ff_kernel_size, + ff_act=hid_act, + ff_dropout_rate=ff_dropout_rate, + att_context=att_context, + att_dropout_rate=att_dropout_rate, + rel_pos_enc=rel_pos_enc, + causal_pos_enc=causal_pos_enc, + norm_before=norm_before, + concat_after=concat_after, + ) + ) self.blocks = nn.ModuleList(blocks) - + if self.norm_before: self.norm = nn.LayerNorm(d_model) - - # def _make_in_layer(self, in_layer_type, in_feats, d_model, - # dropout_rate, pos_dropout_rate, + # def _make_in_layer(self, in_layer_type, in_feats, d_model, + # dropout_rate, pos_dropout_rate, # padding_idx, time_dim): def _make_in_layer(self): @@ -130,24 +146,23 @@ def _make_in_layer(self): nn.LayerNorm(d_model), nn.Dropout(dropout_rate), hid_act, - pos_enc) + pos_enc, + ) elif self.in_layer_type == "conv2d-sub": self.in_layer = Conv2dSubsampler( - in_feats, d_model, hid_act, pos_enc, time_dim=self.in_time_dim) + in_feats, d_model, hid_act, pos_enc, time_dim=self.in_time_dim + ) elif self.in_layer_type == "embed": self.in_layer = nn.Sequential( - nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), - pos_enc) + nn.Embedding(in_feats, d_model, padding_idx=self.padding_idx), pos_enc + ) elif isinstance(self.in_layer_type, nn.Module): - self.in_layer = nn.Sequential( - in_layer_type, - pos_enc) + self.in_layer = nn.Sequential(in_layer_type, pos_enc) elif self.in_layer_type is None: self.in_layer = pos_enc else: raise ValueError("unknown in_layer_type: " + self.in_layer_type) - def forward(self, x, mask=None, target_shape=None, use_amp=False): if use_amp: with torch.cuda.amp.autocast(): @@ -155,7 +170,6 @@ def forward(self, x, mask=None, target_shape=None, use_amp=False): return self._forward(x, mask, target_shape) - def _forward(self, x, mask=None, target_shape=None): """Forward pass function @@ -176,7 +190,7 @@ def _forward(self, x, mask=None, target_shape=None): if isinstance(x, tuple): x, pos_emb = x - b_args = {'pos_emb': pos_emb} + b_args = {"pos_emb": pos_emb} else: b_args = {} @@ -194,44 +208,44 @@ def _forward(self, x, mask=None, target_shape=None): return x, 
mask - def get_config(self): - """ Gets network config + """Gets network config Returns: dictionary with config params """ - config = {'in_feats': self.in_feats, - 'd_model': self.d_model, - 'num_heads': self.num_heads, - 'num_blocks': self.num_blocks, - 'att_type': self.att_type, - 'att_context': self.att_context, - 'ff_type': self.ff_type, - 'd_ff': self.d_ff, - 'ff_kernel_size': self.ff_kernel_size, - 'ff_dropout_rate': self.ff_dropout_rate, - 'att_dropout_rate': self.att_dropout_rate, - 'pos_dropout_rate': self.pos_dropout_rate, - 'in_layer_type': self.in_layer_type, - 'rel_pos_enc': self.rel_pos_enc, - 'causal_pos_enc': self.causal_pos_enc, - 'hid_act': self.hid_act, - 'norm_before': self.norm_before, - 'concat_after': self.concat_after, - 'padding_idx': self.padding_idx, - 'in_time_dim': self.in_time_dim, - 'out_time_dim': self.out_time_dim } - + config = { + "in_feats": self.in_feats, + "d_model": self.d_model, + "num_heads": self.num_heads, + "num_blocks": self.num_blocks, + "att_type": self.att_type, + "att_context": self.att_context, + "ff_type": self.ff_type, + "d_ff": self.d_ff, + "ff_kernel_size": self.ff_kernel_size, + "ff_dropout_rate": self.ff_dropout_rate, + "att_dropout_rate": self.att_dropout_rate, + "pos_dropout_rate": self.pos_dropout_rate, + "in_layer_type": self.in_layer_type, + "rel_pos_enc": self.rel_pos_enc, + "causal_pos_enc": self.causal_pos_enc, + "hid_act": self.hid_act, + "norm_before": self.norm_before, + "concat_after": self.concat_after, + "padding_idx": self.padding_idx, + "in_time_dim": self.in_time_dim, + "out_time_dim": self.out_time_dim, + } + base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - def in_context(self): return (self.att_context, self.att_context) def in_shape(self): """Input shape for network - + Returns: Tuple describing input shape """ @@ -240,7 +254,6 @@ def in_shape(self): else: return (None, self.in_feats, None) - def out_shape(self, in_shape=None): """Infers the network output shape given the input shape @@ -261,8 +274,8 @@ def out_shape(self, in_shape=None): out_t = None else: if isinstance(self.in_layer, Conv2dSubsampler): - #out_t = in_t//4 - out_t = ((in_t - 1)//2 - 1)//2 + # out_t = in_t//4 + out_t = ((in_t - 1) // 2 - 1) // 2 else: out_t = in_t @@ -271,11 +284,9 @@ def out_shape(self, in_shape=None): else: return (batch_size, self.d_model, out_t) - - @staticmethod def filter_args(**kwargs): - """ Filters arguments correspondin to TransformerXVector + """Filters arguments correspondin to TransformerXVector from args dictionary Args: @@ -284,112 +295,147 @@ def filter_args(**kwargs): Returns: args dictionary """ - - valid_args = ('num_blocks', - 'in_feats', - 'd_model', - 'num_heads', - 'att_type', - 'att_context', - 'ff_type', - 'd_ff', - 'ff_kernel_size', - 'ff_dropout_rate', - 'pos_dropout_rate', - 'att_dropout_rate', - 'in_layer_type', - 'hid_act', - 'rel_pos_enc', - 'causal_pos_enc', - 'concat_after') - - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - + valid_args = ( + "num_blocks", + "in_feats", + "d_model", + "num_heads", + "att_type", + "att_context", + "ff_type", + "d_ff", + "ff_kernel_size", + "ff_dropout_rate", + "pos_dropout_rate", + "att_dropout_rate", + "in_layer_type", + "hid_act", + "rel_pos_enc", + "causal_pos_enc", + "concat_after", + ) + + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None, in_feats=False): """Adds Transformer config parameters to argparser - + Args: parser: 
argparse object prefix: prefix string to add to the argument names """ if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") if in_feats: parser.add_argument( - '--in-feats', type=int, default=80, - help=('input feature dimension')) - - parser.add_argument('--num-blocks', - default=6, type=int, - help=('number of tranformer blocks')) - - parser.add_argument('--d-model', - default=512, type=int, - help=('encoder layer sizes')) - - parser.add_argument('--num-heads', - default=4, type=int, - help=('number of heads in self-attention layers')) - - parser.add_argument('--att-type', - default='scaled-dot-prod-v1', - choices=['scaled-dot-prod-v1', 'local-scaled-dot-prod-v1'], - help=('type of self-attention')) - - parser.add_argument('--att-context', - default=25, type=int, - help=('context size when using local attention')) - - parser.add_argument('--ff-type', - default='linear', choices=['linear', 'conv1dx2', 'conv1dlinear'], - help=('type of feed forward layers in transformer block')) - - parser.add_argument('--d-ff', - default=2048, type=int, - help=('size middle layer in feed forward block')) - - parser.add_argument('--ff-kernel-size', - default=3, type=int, - help=('kernel size in convolutional feed forward block')) + "--in-feats", type=int, default=80, help=("input feature dimension") + ) + + parser.add_argument( + "--num-blocks", default=6, type=int, help=("number of tranformer blocks") + ) + + parser.add_argument( + "--d-model", default=512, type=int, help=("encoder layer sizes") + ) + + parser.add_argument( + "--num-heads", + default=4, + type=int, + help=("number of heads in self-attention layers"), + ) + + parser.add_argument( + "--att-type", + default="scaled-dot-prod-v1", + choices=["scaled-dot-prod-v1", "local-scaled-dot-prod-v1"], + help=("type of self-attention"), + ) + + parser.add_argument( + "--att-context", + default=25, + type=int, + help=("context size when using local attention"), + ) + + parser.add_argument( + "--ff-type", + default="linear", + choices=["linear", "conv1dx2", "conv1dlinear"], + help=("type of feed forward layers in transformer block"), + ) + + parser.add_argument( + "--d-ff", + default=2048, + type=int, + help=("size middle layer in feed forward block"), + ) + + parser.add_argument( + "--ff-kernel-size", + default=3, + type=int, + help=("kernel size in convolutional feed forward block"), + ) try: - parser.add_argument('--hid-act', default='relu6', - help='hidden activation') + parser.add_argument("--hid-act", default="relu6", help="hidden activation") except: pass - parser.add_argument('--pos-dropout-rate', default=0.1, type=float, - help='positional encoder dropout') - parser.add_argument('--att-dropout-rate', default=0, type=float, - help='self-att dropout') - parser.add_argument('--ff-dropout-rate', default=0.1, type=float, - help='feed-forward layer dropout') - - parser.add_argument('--in-layer-type', - default='linear', choices=['linear', 'conv2d-sub'], - help=('type of input layer')) - - parser.add_argument('--rel-pos-enc', default=False, action='store_true', - help='use relative positional encoder') - - parser.add_argument('--causal-pos-enc', default=False, action='store_true', - help='relative positional encodings are zero when attending to the future') - - parser.add_argument('--concat-after', default=False, action='store_true', - help='concatenate attention input and output instead of adding') + parser.add_argument( + "--pos-dropout-rate", + default=0.1, + type=float, + help="positional 
encoder dropout", + ) + parser.add_argument( + "--att-dropout-rate", default=0, type=float, help="self-att dropout" + ) + parser.add_argument( + "--ff-dropout-rate", + default=0.1, + type=float, + help="feed-forward layer dropout", + ) + + parser.add_argument( + "--in-layer-type", + default="linear", + choices=["linear", "conv2d-sub"], + help=("type of input layer"), + ) + + parser.add_argument( + "--rel-pos-enc", + default=False, + action="store_true", + help="use relative positional encoder", + ) + + parser.add_argument( + "--causal-pos-enc", + default=False, + action="store_true", + help="relative positional encodings are zero when attending to the future", + ) + + parser.add_argument( + "--concat-after", + default=False, + action="store_true", + help="concatenate attention input and output instead of adding", + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='transformer encoder options') - - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='transformer encoder options') add_argparse_args = add_class_args diff --git a/hyperion/torch/narchs/xvector_classif.py b/hyperion/torch/narchs/xvector_classif.py index 74b29022..e87c3db1 100644 --- a/hyperion/torch/narchs/xvector_classif.py +++ b/hyperion/torch/narchs/xvector_classif.py @@ -11,15 +11,22 @@ from ..layers import ActivationFactory as AF from .net_arch import NetArch -class XVectorClassifV1(NetArch): - def __init__(self, input_units, num_classes, embed_dim=512, - num_hid_layers=2, - hid_act='relu', outputs='logits', - use_batchnorm=True, dropout_rate=0): +class XVectorClassifV1(NetArch): + def __init__( + self, + input_units, + num_classes, + embed_dim=512, + num_hid_layers=2, + hid_act="relu", + outputs="logits", + use_batchnorm=True, + dropout_rate=0, + ): super(XVectorClassifV1, self).__init__() - assert num_hid_layers >= 1, 'num_hid_layers (%d < 1)' % num_hid_layers + assert num_hid_layers >= 1, "num_hid_layers (%d < 1)" % num_hid_layers self.num_hid_layers = num_hid_layers self.input_units = input_units @@ -28,7 +35,7 @@ def __init__(self, input_units, num_classes, embed_dim=512, self.use_batchnorm = use_batchnorm self.dropout_rate = dropout_rate self.outputs = outputs - + if isinstance(hid_units, list): assert num_hid_layers == len(embed_dim) else: @@ -36,14 +43,14 @@ def __init__(self, input_units, num_classes, embed_dim=512, units = [input_units] + embed_dim - #fully connected layers + # fully connected layers fc_layers = [] - for i in range(1, num_hid_layers+1): - fc_layers.append(Linear(units[i-1], units[i])) + for i in range(1, num_hid_layers + 1): + fc_layers.append(Linear(units[i - 1], units[i])) self.fc_layers = nn.ModuleList(fc_layers) - #hidden activations + # hidden activations self.hid_acts = None if hid_act is not None: hid_acts = [] @@ -51,8 +58,8 @@ def __init__(self, input_units, num_classes, embed_dim=512, hid_act = AF.create(hid_act) hid_acts.append(hid_act) self.hid_acts = nn.ModuleList(hid_acts) - - #batch normalization + + # batch normalization self.batchnorm_layers = None if use_batchnorm: batchnorm_layers = [] @@ -71,14 +78,12 @@ def __init__(self, input_units, num_classes, embed_dim=512, # output layers self.logits_layer = Linear(units[-1], num_classes) - - def forward(self, x): for l in range(self.num_hid_layers): if self.use_batchnorm: x = self.batchnorm_layers[l](x) - + x = self.fc_layers[l](x) if self.hid_acts is not None: x = self.hid_acts[l](x) @@ -86,13 +91,10 @@ def forward(self, x): if 
self.dropout_rate > 0: x = self.dropout_layers[l](x) - y = self.logits_layer(x) return y - - def extract_embed(self, x, embed_layers=0): if isinstance(embed_layers, int): @@ -112,35 +114,32 @@ def extract_embed(self, x, embed_layers=0): if l == last_embed_layer: break - + if self.hid_acts is not None: x = self.hid_acts[l](x) if self.dropout_rate > 0: x = self.dropout_layers[l](x) - y = torch.cat((embed_list), dim=-1) return y - - - + def get_config(self): - + if self.hid_acts is None: hid_act = None else: hid_act = AF.get_config(self.hid_acts[0]) - config = {'num_hid_layers': self.num_hid_layers, - 'num_classes': self.num_classes, - 'embed_dim': self.embed_dim, - 'input_units': self.input_units, - 'use_batchnorm': self.use_batchnorm, - 'dropout_rate': self.dropout_rate, - 'hid_act': hid_act } - + config = { + "num_hid_layers": self.num_hid_layers, + "num_classes": self.num_classes, + "embed_dim": self.embed_dim, + "input_units": self.input_units, + "use_batchnorm": self.use_batchnorm, + "dropout_rate": self.dropout_rate, + "hid_act": hid_act, + } + base_config = super(XVectorClassifV1, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - diff --git a/hyperion/torch/optim/factory.py b/hyperion/torch/optim/factory.py index a817e153..4fa7b186 100644 --- a/hyperion/torch/optim/factory.py +++ b/hyperion/torch/optim/factory.py @@ -11,234 +11,315 @@ import torch.optim as optim from .radam import RAdam -class OptimizerFactory(object): +class OptimizerFactory(object): @staticmethod - def create(params, opt_type, lr, momentum=0, - beta1=0.9, beta2=0.99, rho=0.9, eps=1e-8, weight_decay=0, - amsgrad=False, nesterov=False, - lambd=0.0001, asgd_alpha=0.75, t0=1000000.0, - rmsprop_alpha=0.99, centered=False, - lr_decay=0, init_acc_val=0, max_iter=20, oss=False): + def create( + params, + opt_type, + lr, + momentum=0, + beta1=0.9, + beta2=0.99, + rho=0.9, + eps=1e-8, + weight_decay=0, + amsgrad=False, + nesterov=False, + lambd=0.0001, + asgd_alpha=0.75, + t0=1000000.0, + rmsprop_alpha=0.99, + centered=False, + lr_decay=0, + init_acc_val=0, + max_iter=20, + oss=False, + ): kwargs = locals() base_opt = None - if opt_type == 'sgd': - valid_args = ('lr', 'momentum', 'weight_decay', 'nesterov') + if opt_type == "sgd": + valid_args = ("lr", "momentum", "weight_decay", "nesterov") opt_args = filter_args(valid_args, kwargs) - opt_args['dampening']=0 + opt_args["dampening"] = 0 base_opt = optim.SGD # return optim.SGD(params, lr, momentum=momentum, dampening=0, # weight_decay=weight_decay, nesterov=nesterov) - if opt_type == 'adam': + if opt_type == "adam": betas = (beta1, beta2) - valid_args = ('lr', 'eps', 'weight_decay', 'amsgrad') + valid_args = ("lr", "eps", "weight_decay", "amsgrad") opt_args = filter_args(valid_args, kwargs) - opt_args['betas'] = betas + opt_args["betas"] = betas base_opt = optim.Adam # return optim.Adam( # params, lr, betas=(beta1, beta2), eps=eps, # weight_decay=weight_decay, amsgrad=amsgrad) - if opt_type == 'adamw': + if opt_type == "adamw": betas = (beta1, beta2) - valid_args = ('lr', 'eps', 'weight_decay', 'amsgrad') + valid_args = ("lr", "eps", "weight_decay", "amsgrad") opt_args = filter_args(valid_args, kwargs) - opt_args['betas'] = betas + opt_args["betas"] = betas base_opt = optim.AdamW - if opt_type == 'radam': + if opt_type == "radam": betas = (beta1, beta2) - valid_args = ('lr', 'eps', 'weight_decay') + valid_args = ("lr", "eps", "weight_decay") opt_args = filter_args(valid_args, kwargs) - opt_args['betas'] = betas + opt_args["betas"] = betas 
base_opt = RAdam # return RAdam( # params, lr, betas=(beta1, beta2), eps=eps, # weight_decay=weight_decay) - - if opt_type == 'adadelta': - valid_args = ('lr', 'eps', 'weight_decay', 'rho') + if opt_type == "adadelta": + valid_args = ("lr", "eps", "weight_decay", "rho") opt_args = filter_args(valid_args, kwargs) base_opt = optim.Adadelta # return optim.Adadelta(params, lr, rho=rho, eps=eps, # weight_decay=weight_decay) - if opt_type == 'adagrad': - valid_args = ('lr', 'lr_decay', 'weight_decay') + if opt_type == "adagrad": + valid_args = ("lr", "lr_decay", "weight_decay") opt_args = filter_args(valid_args, kwargs) - opt_args['initial_accumulator_value'] = init_acc_val + opt_args["initial_accumulator_value"] = init_acc_val base_opt = optim.Adagrad # return optim.Adagrad( # params, lr, lr_decay=lr_decay, # weight_decay=weight_decay, initial_accumulator_value=init_acc_val) - - if opt_type == 'sparse_adam': + if opt_type == "sparse_adam": betas = (beta1, beta2) - valid_args = ('lr', 'eps') + valid_args = ("lr", "eps") opt_args = filter_args(valid_args, kwargs) - opt_args['betas'] = betas + opt_args["betas"] = betas base_opt = optim.SparseAdam # return optim.SparseAdam(params, lr, betas=(beta1, beta2), eps=eps) - if opt_type == 'adamax': + if opt_type == "adamax": betas = (beta1, beta2) - valid_args = ('lr', 'eps', 'weight_decay') + valid_args = ("lr", "eps", "weight_decay") opt_args = filter_args(valid_args, kwargs) - opt_args['betas'] = betas + opt_args["betas"] = betas base_opt = optim.Adamax # return optim.Adamax(params, lr, betas=(beta1, beta2), eps=eps, # weight_decay=weight_decay) - if opt_type == 'asgd': - valid_args = ('lr', 'lambd', 't0', 'weight_decay') + if opt_type == "asgd": + valid_args = ("lr", "lambd", "t0", "weight_decay") opt_args = filter_args(valid_args, kwargs) - opt_args['alpha'] = asgd_alpha + opt_args["alpha"] = asgd_alpha base_opt = optim.ASGD # return optim.ASGD(params, lr, lambd=lambd, alpha=asgd_alpha, t0=t0, # weight_decay=weight_decay) - if opt_type == 'lbfgs': - valid_args = ('lr', 'max_iter') + if opt_type == "lbfgs": + valid_args = ("lr", "max_iter") opt_args = filter_args(valid_args, kwargs) base_opt = optim.LBFGS # return optim.LBFGS( # params, lr, max_iter=max_iter) - if opt_type == 'rmsprop': - valid_args = ('lr', 'eps', 'momentum', 'weight_decay', 'centered') + if opt_type == "rmsprop": + valid_args = ("lr", "eps", "momentum", "weight_decay", "centered") opt_args = filter_args(valid_args, kwargs) - opt_args['alpha'] = rmsprop_alpha + opt_args["alpha"] = rmsprop_alpha base_opt = optim.RMSprop # return optim.RMSprop( # params, lr, alpha=rmsprop_alpha, eps=eps, # weight_decay=weight_decay, momentum=momentum, centered=centered) - if opt_type == 'rprop': - opts_args = {'lr': lr, 'etas': (0.5, 1.2), 'step_sizes': (1e-06, 50)} + if opt_type == "rprop": + opts_args = {"lr": lr, "etas": (0.5, 1.2), "step_sizes": (1e-06, 50)} base_opt = optim.Rprop # return optim.Rprop(params, lr, etas=(0.5, 1.2), step_sizes=(1e-06, 50)) if base_opt is None: - raise Exception('unknown optimizer %s' % opt_type) + raise Exception("unknown optimizer %s" % opt_type) if oss: from fairscale.optim.oss import OSS - logging.info('Optimizer uses OSS') + + logging.info("Optimizer uses OSS") return OSS(params, base_opt, **opt_args) - - return base_opt(params, **opt_args) + return base_opt(params, **opt_args) @staticmethod def filter_args(**kwargs): - valid_args = ('opt_type', 'lr', 'momentum', 'beta1', 'beta2', - 'rho', 'eps', 'weight_decay', 'amsgrad', 'nesterov', - 
'lambd','asgd_alpha','t0','rmsprop_alpha', - 'centered','lr_decay','init_acc_val','max_iter', 'oss') + valid_args = ( + "opt_type", + "lr", + "momentum", + "beta1", + "beta2", + "rho", + "eps", + "weight_decay", + "amsgrad", + "nesterov", + "lambd", + "asgd_alpha", + "t0", + "rmsprop_alpha", + "centered", + "lr_decay", + "init_acc_val", + "max_iter", + "oss", + ) return filter_args(valid_args, kwargs) - @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") parser.add_argument( - '--opt-type', type=str.lower, - default='adam', - choices=['sgd','adam', 'adamw', 'radam', 'adadelta', 'adagrad', - 'sparse_adam', - 'adamax', 'asgd', 'lbfgs', 'rmsprop','rprop'], - help=('Optimizers: SGD, Adam, AdaDelta, AdaGrad, SparseAdam ' - 'AdaMax, ASGD, LFGS, RMSprop, Rprop')) - parser.add_argument( - '--lr' , - default=0.001, type=float, - help=('Initial learning rate')) + "--opt-type", + type=str.lower, + default="adam", + choices=[ + "sgd", + "adam", + "adamw", + "radam", + "adadelta", + "adagrad", + "sparse_adam", + "adamax", + "asgd", + "lbfgs", + "rmsprop", + "rprop", + ], + help=( + "Optimizers: SGD, Adam, AdaDelta, AdaGrad, SparseAdam " + "AdaMax, ASGD, LFGS, RMSprop, Rprop" + ), + ) parser.add_argument( - '--momentum', default=0.6, type=float, - help=('Momentum')) + "--lr", default=0.001, type=float, help=("Initial learning rate") + ) + parser.add_argument("--momentum", default=0.6, type=float, help=("Momentum")) parser.add_argument( - '--beta1', default=0.9, type=float, - help=('Beta_1 in Adam optimizers, ' - 'coefficient used for computing ' - 'running averages of gradient')) + "--beta1", + default=0.9, + type=float, + help=( + "Beta_1 in Adam optimizers, " + "coefficient used for computing " + "running averages of gradient" + ), + ) parser.add_argument( - '--beta2', default=0.99, type=float, - help=('Beta_2 in Adam optimizers' - 'coefficient used for computing ' - 'running averages of gradient square')) + "--beta2", + default=0.99, + type=float, + help=( + "Beta_2 in Adam optimizers" + "coefficient used for computing " + "running averages of gradient square" + ), + ) parser.add_argument( - '--rho', default=0.9, type=float, - help=('Rho in AdaDelta,' - 'coefficient used for computing a ' - 'running average of squared gradients')) + "--rho", + default=0.9, + type=float, + help=( + "Rho in AdaDelta," + "coefficient used for computing a " + "running average of squared gradients" + ), + ) parser.add_argument( - '--eps', default=1e-8, type=float, - help=('Epsilon in RMSprop and Adam optimizers ' - 'term added to the denominator ' - 'to improve numerical stability')) + "--eps", + default=1e-8, + type=float, + help=( + "Epsilon in RMSprop and Adam optimizers " + "term added to the denominator " + "to improve numerical stability" + ), + ) parser.add_argument( - '--weight-decay', default=1e-6, type=float, - help=('L2 regularization coefficient')) + "--weight-decay", + default=1e-6, + type=float, + help=("L2 regularization coefficient"), + ) parser.add_argument( - '--amsgrad', default=False, - action='store_true', - help=('AMSGrad variant of Adam')) + "--amsgrad", + default=False, + action="store_true", + help=("AMSGrad variant of Adam"), + ) parser.add_argument( - '--nesterov', default=False, - action='store_true', - help=('Use Nesterov momentum in SGD')) + "--nesterov", + default=False, + action="store_true", + help=("Use Nesterov momentum in SGD"), + ) parser.add_argument( - '--lambd', 
default=0.0001, type=float, - help=('decay term in ASGD')) + "--lambd", default=0.0001, type=float, help=("decay term in ASGD") + ) parser.add_argument( - '--asgd-alpha', - default=0.75, type=float, - help=('power for eta update in ASGD')) + "--asgd-alpha", + default=0.75, + type=float, + help=("power for eta update in ASGD"), + ) parser.add_argument( - '--t0', default=1e6, type=float, - help=('point at which to start averaging in ASGD')) + "--t0", + default=1e6, + type=float, + help=("point at which to start averaging in ASGD"), + ) parser.add_argument( - '--rmsprop-alpha', default=0.99, type=float, - help=('smoothing constant in RMSprop')) - + "--rmsprop-alpha", + default=0.99, + type=float, + help=("smoothing constant in RMSprop"), + ) + parser.add_argument( - '--centered', default=False, - action='store_true', - help=('Compute centered RMSprop, gradient normalized ' - 'by its variance')) + "--centered", + default=False, + action="store_true", + help=("Compute centered RMSprop, gradient normalized " "by its variance"), + ) parser.add_argument( - '--lr-decay', default=1e-6, type=float, - help=('Learning rate decay in AdaGrad optimizer')) - + "--lr-decay", + default=1e-6, + type=float, + help=("Learning rate decay in AdaGrad optimizer"), + ) + parser.add_argument( - '--init-acc-val', default=0, type=float, - help=('Init accum value in Adagrad')) + "--init-acc-val", + default=0, + type=float, + help=("Init accum value in Adagrad"), + ) parser.add_argument( - '--max-iter', default=20, type=int, - help=('max iterations in LBGS')) + "--max-iter", default=20, type=int, help=("max iterations in LBGS") + ) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - # help='optimizer options') - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + # help='optimizer options') add_argparse_args = add_class_args diff --git a/hyperion/torch/optim/fgsm.py b/hyperion/torch/optim/fgsm.py index ab830d38..f0a0ad93 100644 --- a/hyperion/torch/optim/fgsm.py +++ b/hyperion/torch/optim/fgsm.py @@ -7,13 +7,14 @@ import torch from torch.optim.optimizer import Optimizer + class FGSM(Optimizer): """Implements Fast Gradient Sign Method""" + def __init__(self, params, epsilon): defaults = dict(epsilon=epsilon) super(FGSM, self).__init__(params, defaults) - def step(self, closure=None): """Performs a single optimization step. @@ -26,13 +27,12 @@ def step(self, closure=None): loss = closure() for group in self.param_groups: - epsilon = group['epsilon'] + epsilon = group["epsilon"] - for p in group['params']: + for p in group["params"]: if p.grad is None: continue d_p = p.grad.data p.data.add_(-epsilon, d_p.sign()) return loss - diff --git a/hyperion/torch/optim/radam.py b/hyperion/torch/optim/radam.py index 1cd79af3..459646c1 100644 --- a/hyperion/torch/optim/radam.py +++ b/hyperion/torch/optim/radam.py @@ -1,45 +1,60 @@ """ Code taken from https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam/radam.py """ -# +# import math import torch from torch.optim.optimizer import Optimizer, required + class RAdam(Optimizer): - """ Implements Rectified Adam optimzier (RAdam) from + """Implements Rectified Adam optimzier (RAdam) from - Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, Xiaodong Liu, - Jianfeng Gao, and Jiawei Han. "On the Variance of the Adaptive - Learning Rate and Beyond." arXiv preprint arXiv:1908.03265 (2019). + Liyuan Liu, Haoming Jiang, Pengcheng He, Weizhu Chen, Xiaodong Liu, + Jianfeng Gao, and Jiawei Han. 
"On the Variance of the Adaptive + Learning Rate and Beyond." arXiv preprint arXiv:1908.03265 (2019). - code taken from: - https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam/radam.py + code taken from: + https://github.com/LiyuanLucasLiu/RAdam/blob/master/radam/radam.py """ - def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, - weight_decay=0, degenerated_to_sgd=True): + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-8, + weight_decay=0, + degenerated_to_sgd=True, + ): if not 0.0 <= lr: raise ValueError("Invalid learning rate: {}".format(lr)) if not 0.0 <= eps: raise ValueError("Invalid epsilon value: {}".format(eps)) if not 0.0 <= betas[0] < 1.0: - raise ValueError("Invalid beta parameter at index 0: {}".format( - betas[0])) + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) if not 0.0 <= betas[1] < 1.0: - raise ValueError("Invalid beta parameter at index 1: {}".format( - betas[1])) - + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + self.degenerated_to_sgd = degenerated_to_sgd - if isinstance(params, (list, tuple)) and len(params) > 0 and isinstance( - params[0], dict): + if ( + isinstance(params, (list, tuple)) + and len(params) > 0 + and isinstance(params[0], dict) + ): for param in params: - if 'betas' in param and ( - param['betas'][0] != betas[0] or param['betas'][1] != betas[1]): - param['buffer'] = [[None, None, None] for _ in range(10)] - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, - buffer=[[None, None, None] for _ in range(10)]) + if "betas" in param and ( + param["betas"][0] != betas[0] or param["betas"][1] != betas[1] + ): + param["buffer"] = [[None, None, None] for _ in range(10)] + defaults = dict( + lr=lr, + betas=betas, + eps=eps, + weight_decay=weight_decay, + buffer=[[None, None, None] for _ in range(10)], + ) super().__init__(params, defaults) def __setstate__(self, state): @@ -53,67 +68,74 @@ def step(self, closure=None): for group in self.param_groups: - for p in group['params']: + for p in group["params"]: if p.grad is None: continue grad = p.grad.data.float() if grad.is_sparse: - raise RuntimeError('RAdam does not support sparse gradients') + raise RuntimeError("RAdam does not support sparse gradients") p_data_fp32 = p.data.float() state = self.state[p] if len(state) == 0: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_data_fp32) - state['exp_avg_sq'] = torch.zeros_like(p_data_fp32) + state["step"] = 0 + state["exp_avg"] = torch.zeros_like(p_data_fp32) + state["exp_avg_sq"] = torch.zeros_like(p_data_fp32) else: - state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32) - state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32) + state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32) + state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32) - exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq'] - beta1, beta2 = group['betas'] + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] - exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value= 1 - beta2) - exp_avg.mul_(beta1).add_(grad, alpha= 1 - beta1) + exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2) + exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1) - state['step'] += 1 - buffered = group['buffer'][int(state['step'] % 10)] - if state['step'] == buffered[0]: + state["step"] += 1 + buffered = group["buffer"][int(state["step"] % 10)] + if state["step"] == buffered[0]: N_sma, step_size = buffered[1], buffered[2] 
else: - buffered[0] = state['step'] - beta2_t = beta2 ** state['step'] + buffered[0] = state["step"] + beta2_t = beta2 ** state["step"] N_sma_max = 2 / (1 - beta2) - 1 - N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t) + N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t) buffered[1] = N_sma # more conservative since it's an approximated value if N_sma >= 5: - step_size = (math.sqrt( - (1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * - (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / - (1 - beta1 ** state['step'])) + step_size = math.sqrt( + (1 - beta2_t) + * (N_sma - 4) + / (N_sma_max - 4) + * (N_sma - 2) + / N_sma + * N_sma_max + / (N_sma_max - 2) + ) / (1 - beta1 ** state["step"]) elif self.degenerated_to_sgd: - step_size = 1.0 / (1 - beta1 ** state['step']) + step_size = 1.0 / (1 - beta1 ** state["step"]) else: step_size = -1 buffered[2] = step_size # more conservative since it's an approximated value if N_sma >= 5: - if group['weight_decay'] != 0: + if group["weight_decay"] != 0: p_data_fp32.add_( - p_data_fp32, alpha= -group['weight_decay'] * group['lr']) - denom = exp_avg_sq.sqrt().add_(group['eps']) - p_data_fp32.addcdiv_(exp_avg, denom, value = -step_size * group['lr']) + p_data_fp32, alpha=-group["weight_decay"] * group["lr"] + ) + denom = exp_avg_sq.sqrt().add_(group["eps"]) + p_data_fp32.addcdiv_(exp_avg, denom, value=-step_size * group["lr"]) p.data.copy_(p_data_fp32) elif step_size > 0: - if group['weight_decay'] != 0: + if group["weight_decay"] != 0: p_data_fp32.add_( - p_data_fp32, alpha= -group['weight_decay'] * group['lr']) - p_data_fp32.add_(exp_avg, alpha= -step_size * group['lr']) + p_data_fp32, alpha=-group["weight_decay"] * group["lr"] + ) + p_data_fp32.add_(exp_avg, alpha=-step_size * group["lr"]) p.data.copy_(p_data_fp32) return loss diff --git a/hyperion/torch/torch_defs.py b/hyperion/torch/torch_defs.py index 9c4d973a..a567de50 100644 --- a/hyperion/torch/torch_defs.py +++ b/hyperion/torch/torch_defs.py @@ -6,17 +6,23 @@ import torch -str2torch_dtype = { 'float32': torch.float32, - 'float64': torch.float64, - 'float16': torch.float16} +str2torch_dtype = { + "float32": torch.float32, + "float64": torch.float64, + "float16": torch.float16, +} + +torch_dtype2str = { + torch.float32: "float32", + torch.float64: "float64", + torch.float16: "float16", +} -torch_dtype2str = { torch.float32: 'float32', - torch.float64: 'float64', - torch.float16: 'float16'} def floatstr_torch(): return torch_dtype2str[torch.get_default_dtype()] + def float_torch(): return torch.get_default_dtype() @@ -24,8 +30,5 @@ def float_torch(): def set_float_torch(float_torch): if isinstance(float_torch, str): float_torch = str2torch_dtype[float_torch] - - torch.set_default_dtype(float_torch) - - + torch.set_default_dtype(float_torch) diff --git a/hyperion/torch/torch_model.py b/hyperion/torch/torch_model.py index 49c51c4e..66c4d028 100644 --- a/hyperion/torch/torch_model.py +++ b/hyperion/torch/torch_model.py @@ -10,38 +10,32 @@ class TorchModel(nn.Module): - def get_config(self): - config = { - 'class_name': self.__class__.__name__} - + config = {"class_name": self.__class__.__name__} + return config - def copy(self): return deepcopy(self) - def save(self, file_path): file_dir = os.path.dirname(file_path) - if not(os.path.isdir(file_dir)): + if not (os.path.isdir(file_dir)): os.makedirs(file_dir, exist_ok=True) config = self.get_config() - torch.save({'model_cfg': self.get_config(), - 'model_state_dict': self.state_dict()}) - + torch.save( + {"model_cfg": 
self.get_config(), "model_state_dict": self.state_dict()} + ) def freeze(self): for param in self.parameters(): param.requires_grad = False - def unfreeze(self): for param in self.parameters(): param.requires_grad = True - @staticmethod def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): model_data = None @@ -49,42 +43,40 @@ def _load_cfg_state_dict(file_path=None, cfg=None, state_dict=None): assert file_path is not None model_data = torch.load(file_path) if cfg is None: - cfg = model_data['model_cfg'] + cfg = model_data["model_cfg"] if state_dict is None and model_data is not None: - state_dict = model_data['model_state_dict'] + state_dict = model_data["model_state_dict"] - if 'class_name' in cfg: - del cfg['class_name'] + if "class_name" in cfg: + del cfg["class_name"] return cfg, state_dict - @classmethod def load(cls, file_path=None, cfg=None, state_dict=None): - cfg, state_dict = TorchModel._load_cfg_state_dict( - file_path, cfg, state_dict) - + cfg, state_dict = TorchModel._load_cfg_state_dict(file_path, cfg, state_dict) + model = cls(**cfg) if state_dict is not None: model.load_state_dict(state_dict) return model - def get_reg_loss(self): return 0 - def get_loss(self): return 0 - @property def device(self): - devices = ({param.device for param in self.parameters()} | - {buf.device for buf in self.buffers()}) + devices = {param.device for param in self.parameters()} | { + buf.device for buf in self.buffers() + } if len(devices) != 1: raise RuntimeError( - 'Cannot determine device: {} different devices found'.format( - len(devices))) + "Cannot determine device: {} different devices found".format( + len(devices) + ) + ) return next(iter(devices)) diff --git a/hyperion/torch/torch_model_loader.py b/hyperion/torch/torch_model_loader.py index fdc04b04..4a4007ce 100644 --- a/hyperion/torch/torch_model_loader.py +++ b/hyperion/torch/torch_model_loader.py @@ -13,30 +13,29 @@ class TorchModelLoader(object): - @staticmethod def load(file_path, extra_objs={}, map_location=None): if map_location is None: - map_location=torch.device('cpu') + map_location = torch.device("cpu") model_data = torch.load(file_path, map_location=map_location) - cfg = model_data['model_cfg'] - class_name = cfg['class_name'] - del cfg['class_name'] + cfg = model_data["model_cfg"] + class_name = cfg["class_name"] + del cfg["class_name"] if class_name in globals(): class_obj = globals()[class_name] elif class_name in extra_objs: class_obs = extra_objs[class_name] else: - raise Exception('unknown object with class_name=%s' % (class_name)) + raise Exception("unknown object with class_name=%s" % (class_name)) - state_dict = model_data['model_state_dict'] + state_dict = model_data["model_state_dict"] - if 'n_averaged' in state_dict: - del state_dict['n_averaged'] + if "n_averaged" in state_dict: + del state_dict["n_averaged"] - p = re.compile('^module\.') + p = re.compile("^module\.") num_tries = 3 for tries in range(num_tries): try: @@ -44,7 +43,7 @@ def load(file_path, extra_objs={}, map_location=None): except RuntimeError as err: # remove module prefix when is trained with dataparallel if tries == num_tries - 1: - #if it failed the 3 trials raise exception + # if it failed the 3 trials raise exception raise err # remove module prefix when is trained with dataparallel - state_dict = ODict((p.sub('',k), v) for k,v in state_dict.items()) + state_dict = ODict((p.sub("", k), v) for k, v in state_dict.items()) diff --git a/hyperion/torch/trainers/ae_trainer.py b/hyperion/torch/trainers/ae_trainer.py index 
7ed6e7be..8646c79f 100644 --- a/hyperion/torch/trainers/ae_trainer.py +++ b/hyperion/torch/trainers/ae_trainer.py @@ -18,94 +18,99 @@ class AETrainer(TorchTrainer): """Auto-encoder trainer class - Attributes: - model: model object. - loss: nn.Module loss class - optim: pytorch optimizer object or optimizer options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object - loggers: LoggerList object, loggers write training progress to std. output and file. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: model object. + loss: nn.Module loss class + optim: pytorch optimizer object or optimizer options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object + loggers: LoggerList object, loggers write training progress to std. output and file. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. 
steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - loss, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): + + def __init__( + self, + model, + loss, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): if loss is None: loss = nn.MSELoss() - super().__init__(model, - loss, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + super().__init__( + model, + loss, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) def train_epoch(self, data_loader): """Training epoch loop - Args: - data_loader: pytorch data loader returning features and class labels. + Args: + data_loader: pytorch data loader returning features and class labels. 
""" metric_acc = MetricAcc(device=self.device) @@ -138,19 +143,19 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics['loss'] = loss.item() * self.grad_acc_steps + batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, data) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - #total_batches += 1 + # total_batches += 1 logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -159,10 +164,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: - log_tag = 'train_' + log_tag = "train_" self.set_train_mode() else: - log_tag = 'val_' + log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): @@ -175,7 +180,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): output = self.model(data) loss = self.loss(output, data) - batch_metrics['loss'] = loss.mean().item() + batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, data) diff --git a/hyperion/torch/trainers/dvae_trainer.py b/hyperion/torch/trainers/dvae_trainer.py index c4def617..0d9b1de3 100644 --- a/hyperion/torch/trainers/dvae_trainer.py +++ b/hyperion/torch/trainers/dvae_trainer.py @@ -18,90 +18,95 @@ class DVAETrainer(TorchTrainer): """Denoising VAE trainer class - Attributes: - model: model object. - optim: pytorch optimizer object or optimizer options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object - loggers: LoggerList object, loggers write training progress to std. output and file. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: model object. + optim: pytorch optimizer object or optimizer options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object + loggers: LoggerList object, loggers write training progress to std. output and file. 
+ ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): - - super().__init__(model, - None, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + super().__init__( + model, + None, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) def train_epoch(self, data_loader): """Training epoch loop - Args: - data_loader: pytorch data loader returning noisy and clean features + Args: + data_loader: pytorch data loader returning noisy and clean features """ metric_acc = MetricAcc(device=self.device) @@ -126,9 +131,9 @@ def train_epoch(self, data_loader): with self.amp_autocast(): output = self.model(x, x_target=x_target, return_x_mean=True) - elbo = output['elbo'].mean() + elbo = output["elbo"].mean() loss = -elbo / self.grad_acc_steps - x_hat = output['x_mean'] + x_hat = output["x_mean"] if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -140,20 +145,20 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics['elbo'] = elbo.item() - for metric in ['log_px', 'kldiv_z']: + batch_metrics["elbo"] = elbo.item() + for metric in ["log_px", "kldiv_z"]: 
batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(x_hat, x_target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -166,10 +171,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: - log_tag = 'train_' + log_tag = "train_" self.set_train_mode() else: - log_tag = 'val_' + log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): @@ -183,12 +188,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_size = x.shape[0] with self.amp_autocast(): - output = self.model(x, - x_target=x_target, - return_x_mean=True) + output = self.model(x, x_target=x_target, return_x_mean=True) - x_hat = output['x_mean'] - for metric in ['elbo', 'log_px', 'kldiv_z']: + x_hat = output["x_mean"] + for metric in ["elbo", "log_px", "kldiv_z"]: batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(x_hat, x_target) diff --git a/hyperion/torch/trainers/plda_trainer.py b/hyperion/torch/trainers/plda_trainer.py index b434ab16..4365ed56 100644 --- a/hyperion/torch/trainers/plda_trainer.py +++ b/hyperion/torch/trainers/plda_trainer.py @@ -19,35 +19,35 @@ class PLDATrainer(TorchTrainer): """Trainer to train PLDA back-end - Attributes: - model: PLDA model object. - optim: pytorch optimizer object - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object - loggers: LoggerList object, loggers write training progress to std. output and file. - If None, it uses default loggers. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - loss: if None, it uses cross-entropy - loss_weights: dictionary with weights for multiclass and binary cross-entropies - - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: PLDA model object. + optim: pytorch optimizer object + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object + loggers: LoggerList object, loggers write training progress to std. output and file. 
+ If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + loss_weights: dictionary with weights for multiclass and binary cross-entropies + + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ def __init__( @@ -117,8 +117,8 @@ def __init__( def train_epoch(self, data_loader): """Training epoch loop - Args: - data_loader: pytorch data loader returning features and class labels. + Args: + data_loader: pytorch data loader returning features and class labels. """ self.model.update_margin(self.cur_epoch) diff --git a/hyperion/torch/trainers/torch_trainer.py b/hyperion/torch/trainers/torch_trainer.py index 3a57bba8..3f9c8aca 100644 --- a/hyperion/torch/trainers/torch_trainer.py +++ b/hyperion/torch/trainers/torch_trainer.py @@ -39,32 +39,32 @@ class DDPType(str, Enum): class TorchTrainer(object): """Base Trainer class to train basic neural network models - Attributes: - model: model object. - loss: nn.Module loss class - optim: pytorch optimizer object or optimizer options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object - loggers: LoggerList object, loggers write training progress to std. output and file. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: model object. + loss: nn.Module loss class + optim: pytorch optimizer object or optimizer options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object + loggers: LoggerList object, loggers write training progress to std. output and file. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. 
steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ def __init__( @@ -207,7 +207,7 @@ def __init__( def fit(self, train_data, val_data=None): """Training function, it performs the training and validation epochs - + Args: train_data: PyTorch data loader for the training loop val_data: PyTorch data loader for the validation loop @@ -416,8 +416,7 @@ def _make_lr_sched(self, lr_sched, optim): return lr_sched def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): - """Creates the default data loaders - """ + """Creates the default data loaders""" prog_log = ProgLogger(interval=log_interval) csv_log = CSVLogger(self.exp_path / "train.log", append=True) loggers = [prog_log, csv_log] @@ -434,8 +433,7 @@ def _default_loggers(self, log_interval, use_tensorboard, use_wandb, wandb): return LoggerList(loggers) def _get_lr(self): - """Returns the current learning rate to show in the loggers - """ + """Returns the current learning rate to show in the loggers""" for param_group in self.optimizer.param_groups: return param_group["lr"] @@ -558,8 +556,7 @@ def load_checkpoint(self, file_path): return logs def load_last_checkpoint(self): - """Loads the last training checkpoint in the experiment dir. - """ + """Loads the last training checkpoint in the experiment dir.""" for epoch in range(self.epochs, 0, -1): file_path = "%s/model_ep%04d.pth" % (self.exp_path, epoch) if os.path.isfile(file_path): diff --git a/hyperion/torch/trainers/vae_trainer.py b/hyperion/torch/trainers/vae_trainer.py index f5ede0c6..53486c7b 100644 --- a/hyperion/torch/trainers/vae_trainer.py +++ b/hyperion/torch/trainers/vae_trainer.py @@ -18,84 +18,89 @@ class VAETrainer(TorchTrainer): """Variational Auto-encoder trainer class - Attributes: - model: model object. - optim: pytorch optimizer object or optimizer options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object - loggers: LoggerList object, loggers write training progress to std. output and file. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: model object. + optim: pytorch optimizer object or optimizer options dict + epochs: max. 
number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object + loggers: LoggerList object, loggers write training progress to std. output and file. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): - - super().__init__(model, - None, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + super().__init__( + model, + None, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) def train_epoch(self, data_loader): @@ -118,9 +123,9 @@ def train_epoch(self, data_loader): with self.amp_autocast(): output = self.model(data, return_x_mean=True) - elbo = output['elbo'].mean() + elbo = output["elbo"].mean() loss = -elbo / self.grad_acc_steps - x_hat = output['x_mean'] + x_hat = output["x_mean"] if self.use_amp: self.grad_scaler.scale(loss).backward() @@ -132,20 +137,20 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics['elbo'] 
= elbo.item() - for metric in ['log_px', 'kldiv_z']: + batch_metrics["elbo"] = elbo.item() + for metric in ["log_px", "kldiv_z"]: batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(x_hat, data) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -154,10 +159,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: - log_tag = 'train_' + log_tag = "train_" self.set_train_mode() else: - log_tag = 'val_' + log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): @@ -170,8 +175,8 @@ def validation_epoch(self, data_loader, swa_update_bn=False): with self.amp_autocast(): output = self.model(data, return_x_mean=True) - x_hat = output['x_mean'] - for metric in ['elbo', 'log_px', 'kldiv_z']: + x_hat = output["x_mean"] + for metric in ["elbo", "log_px", "kldiv_z"]: batch_metrics[metric] = output[metric].mean().item() for k, metric in self.metrics.items(): diff --git a/hyperion/torch/trainers/vq_dvae_trainer.py b/hyperion/torch/trainers/vq_dvae_trainer.py index 2d41cd65..a2da616c 100644 --- a/hyperion/torch/trainers/vq_dvae_trainer.py +++ b/hyperion/torch/trainers/vq_dvae_trainer.py @@ -18,83 +18,88 @@ class VQDVAETrainer(DVAETrainer): """Vector Quantized Variational Auto-encoder trainer class - Attributes: - model: model object. - optim: pytorch optimizer object or optimizer options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object - loggers: LoggerList object, loggers write training progress to std. output and file. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: model object. + optim: pytorch optimizer object or optimizer options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object + loggers: LoggerList object, loggers write training progress to std. output and file. 
+ ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): - - super().__init__(model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + super().__init__( + model, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) def train_epoch(self, data_loader): @@ -119,8 +124,8 @@ def train_epoch(self, data_loader): with self.amp_autocast(): output = self.model(x, x_target=x_target, return_x_mean=True) - loss = output['loss'] - x_hat = output['x_mean'] + loss = output["loss"] + x_hat = output["x_mean"] loss = loss.mean() / self.grad_acc_steps if self.use_amp: @@ -133,22 +138,23 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics['loss'] = loss.item() * self.grad_acc_steps - for metric in ['elbo', 'log_px', 'kldiv_z', 'vq_loss']: + batch_metrics["loss"] = loss.item() * self.grad_acc_steps + for metric in ["elbo", "log_px", "kldiv_z", "vq_loss"]: batch_metrics[metric] = output[metric].mean().item() - batch_metrics['perplexity'] = math.exp( - output['log_perplexity'].mean().item()) + batch_metrics["perplexity"] = math.exp( + 
output["log_perplexity"].mean().item() + ) for k, metric in self.metrics.items(): batch_metrics[k] = metric(x_hat, x_target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -157,10 +163,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: - log_tag = 'train' + log_tag = "train" self.set_train_mode() else: - log_tag = 'val_' + log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): @@ -174,15 +180,14 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_size = x.shape[0] with self.amp_autocast(): - output = self.model(x, - x_target=x_target, - return_x_mean=True) + output = self.model(x, x_target=x_target, return_x_mean=True) - x_hat = output['x_mean'] - for metric in ['loss', 'elbo', 'log_px', 'kldiv_z', 'vq_loss']: + x_hat = output["x_mean"] + for metric in ["loss", "elbo", "log_px", "kldiv_z", "vq_loss"]: batch_metrics[metric] = output[metric].mean().item() - batch_metrics['perplexity'] = math.exp( - output['log_perplexity'].mean().item()) + batch_metrics["perplexity"] = math.exp( + output["log_perplexity"].mean().item() + ) for k, metric in self.metrics.items(): batch_metrics[k] = metric(x_hat, x_target) diff --git a/hyperion/torch/trainers/vq_vae_trainer.py b/hyperion/torch/trainers/vq_vae_trainer.py index 199253ca..d187af79 100644 --- a/hyperion/torch/trainers/vq_vae_trainer.py +++ b/hyperion/torch/trainers/vq_vae_trainer.py @@ -18,83 +18,88 @@ class VQVAETrainer(VAETrainer): """Vector Quantized Variational Auto-encoder trainer class - Attributes: - model: model object. - optim: pytorch optimizer object or optimizer options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object - loggers: LoggerList object, loggers write training progress to std. output and file. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: model object. + optim: pytorch optimizer object or optimizer options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. 
+ lrsched: learning rate scheduler object + loggers: LoggerList object, loggers write training progress to std. output and file. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): - - super().__init__(model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + + def __init__( + self, + model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + super().__init__( + model, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) def train_epoch(self, data_loader): @@ -119,8 +124,8 @@ def train_epoch(self, data_loader): with self.amp_autocast(): output = self.model(x, return_x_mean=True) - loss = output['loss'] - x_hat = output['x_mean'] + loss = output["loss"] + x_hat = output["x_mean"] loss = loss.mean() / self.grad_acc_steps if self.use_amp: @@ -133,22 +138,23 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics['loss'] = loss.item() * self.grad_acc_steps - for metric in ['elbo', 'log_px', 'kldiv_z', 'vq_loss']: + batch_metrics["loss"] = loss.item() * self.grad_acc_steps + for metric in ["elbo", "log_px", "kldiv_z", "vq_loss"]: batch_metrics[metric] = output[metric].mean().item() - batch_metrics['perplexity'] = math.exp( - 
output['log_perplexity'].mean().item()) + batch_metrics["perplexity"] = math.exp( + output["log_perplexity"].mean().item() + ) for k, metric in self.metrics.items(): batch_metrics[k] = metric(x_hat, x) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -157,10 +163,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: - log_tag = 'train_' + log_tag = "train_" self.set_train_mode() else: - log_tag = 'val_' + log_tag = "val_" self.model.eval() for batch, data in enumerate(data_loader): @@ -176,11 +182,12 @@ def validation_epoch(self, data_loader, swa_update_bn=False): with self.amp_autocast(): output = self.model(x, return_x_mean=True) - x_hat = output['x_mean'] - for metric in ['loss', 'elbo', 'log_px', 'kldiv_z', 'vq_loss']: + x_hat = output["x_mean"] + for metric in ["loss", "elbo", "log_px", "kldiv_z", "vq_loss"]: batch_metrics[metric] = output[metric].mean().item() - batch_metrics['perplexity'] = math.exp( - output['log_perplexity'].mean().item()) + batch_metrics["perplexity"] = math.exp( + output["log_perplexity"].mean().item() + ) for k, metric in self.metrics.items(): batch_metrics[k] = metric(x_hat, x) diff --git a/hyperion/torch/trainers/xvector_adv_trainer.py b/hyperion/torch/trainers/xvector_adv_trainer.py index fe866a48..0784a2ea 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer.py +++ b/hyperion/torch/trainers/xvector_adv_trainer.py @@ -18,91 +18,96 @@ class XVectorAdvTrainer(XVectorTrainer): """Adversarial Training of x-vectors with attack in feature domain - Attributes: - model: x-Vector model object. - attack: adv. attack generator object - optim: pytorch optimizer object or options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - p_attack: attack probability - p_val_attack: attack probability in validation - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lr_scheduler: learning rate scheduler object - loggers: LoggerList object, loggers write training progress to std. output and file. - If None, it uses default loggers. - data_parallel: if True use nn.DataParallel - loss: if None, it uses cross-entropy - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: x-Vector model object. + attack: adv. attack generator object + optim: pytorch optimizer object or options dict + epochs: max. 
number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + p_attack: attack probability + p_val_attack: attack probability in validation + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lr_scheduler: learning rate scheduler object + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + data_parallel: if True use nn.DataParallel + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - attack, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - p_attack=0.8, - p_val_attack=0, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - loss=None, - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): - - super().__init__(model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + + def __init__( + self, + model, + attack, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + p_attack=0.8, + p_val_attack=0, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + super().__init__( + model, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + loss=loss, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) self.attack = attack self.attack.to(device) @@ -110,11 +115,15 @@ def __init__(self, self.p_val_attack = p_val_attack if self.p_attack > 1: logging.warning( - ('p-attack(%f) cannot be larger than 1./grad-acc-steps (%f)' - 'because we can only create adv. signals in the ' - 'first step of the gradient acc. 
loop given that' - 'adv optimization over-writes the gradients ' - 'stored in the model') % (p_attack, 1. / self.grad_acc_steps)) + ( + "p-attack(%f) cannot be larger than 1./grad-acc-steps (%f)" + "because we can only create adv. signals in the " + "first step of the gradient acc. loop given that" + "adv optimization over-writes the gradients " + "stored in the model" + ) + % (p_attack, 1.0 / self.grad_acc_steps) + ) # if data_parallel: # # change model in attack by the data parallel version @@ -139,13 +148,11 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: if torch.rand(1) < self.p_attack: # generate adversarial attacks - logging.info('generating adv attack for batch=%d' % - (batch)) + logging.info("generating adv attack for batch=%d" % (batch)) self.model.eval() data_adv = self.attack.generate(data, target) max_delta = torch.max(torch.abs(data_adv - data)).item() - logging.info('adv attack max perturbation=%f' % - (max_delta)) + logging.info("adv attack max perturbation=%f" % (max_delta)) data = data_adv self.set_train_mode() @@ -165,18 +172,18 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics['loss'] = loss.item() * self.grad_acc_steps + batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -185,10 +192,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics = ODict() if swa_update_bn: - log_tag = 'train_' + log_tag = "train_" self.set_train_mode() else: - log_tag = 'val_' + log_tag = "val_" self.model.eval() for batch, (data, target) in enumerate(data_loader): @@ -207,7 +214,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): output = self.model(data, **self.amp_args) loss = self.loss(output, target) - batch_metrics['loss'] = loss.mean().item() + batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) @@ -220,7 +227,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): @staticmethod def filter_args(**kwargs): args = XVectorTrainer.filter_args(**kwargs) - valid_args = ('p_attack', 'p_val_attack') + valid_args = ("p_attack", "p_val_attack") args_1 = dict((k, kwargs[k]) for k in valid_args if k in kwargs) args.update(args_1) return args @@ -229,20 +236,22 @@ def filter_args(**kwargs): def add_class_args(parser, prefix=None, skip=[]): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") XVectorTrainer.add_class_args(parser, skip=skip) - parser.add_argument('--p-attack', - default=0.5, - type=float, - help='ratio of batches with adv attack') parser.add_argument( - '--p-val-attack', - default=0., + "--p-attack", + default=0.5, + type=float, + help="ratio of batches with adv attack", + ) + parser.add_argument( + "--p-val-attack", + default=0.0, type=float, - help='ratio of batches with adv attack in validation') + help="ratio of batches with adv attack in validation", + ) if prefix 
is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='trainer options') diff --git a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py index 085b4128..75c3ece8 100644 --- a/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_adv_trainer_from_wav.py @@ -11,101 +11,106 @@ import torch import torch.nn as nn -from ..utils import MetricAcc #, TorchDataParallel +from ..utils import MetricAcc # , TorchDataParallel from .xvector_trainer_from_wav import XVectorTrainerFromWav class XVectorAdvTrainerFromWav(XVectorTrainerFromWav): """Adversarial Training of x-vectors with attack in feature domain - Attributes: - model: x-Vector model object. - feat_extractor: feature extractor nn.Module - attack: adv. attack generator object - optim: pytorch optimizer object or options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - p_attack: attack probability - p_val_attack: attack probability in validation - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object or options dict - loggers: LoggerList object, loggers write training progress to std. output and file. - If None, it uses default loggers. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - loss: if None, it uses cross-entropy - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: x-Vector model object. + feat_extractor: feature extractor nn.Module + attack: adv. attack generator object + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + p_attack: attack probability + p_val_attack: attack probability in validation + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. 
steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - feat_extractor, - attack, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - p_attack=0.8, - p_val_attack=0, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - loss=None, - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): - - super().__init__(model, - feat_extractor, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + + def __init__( + self, + model, + feat_extractor, + attack, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + p_attack=0.8, + p_val_attack=0, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + super().__init__( + model, + feat_extractor, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + loss=loss, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) self.attack = attack self.attack.to(device) @@ -113,11 +118,15 @@ def __init__(self, self.p_val_attack = p_val_attack if self.p_attack > 1: logging.warning( - ('p-attack(%f) cannot be larger than 1./grad-acc-steps (%f)' - 'because we can only create adv. signals in the ' - 'first step of the gradient acc. loop given that' - 'adv optimization over-writes the gradients ' - 'stored in the model') % (p_attack, 1. / self.grad_acc_steps)) + ( + "p-attack(%f) cannot be larger than 1./grad-acc-steps (%f)" + "because we can only create adv. signals in the " + "first step of the gradient acc. 
loop given that" + "adv optimization over-writes the gradients " + "stored in the model" + ) + % (p_attack, 1.0 / self.grad_acc_steps) + ) # if data_parallel: # # change model in attack by the data parallel version @@ -142,13 +151,13 @@ def train_epoch(self, data_loader): if batch % self.grad_acc_steps == 0: if torch.rand(1) < self.p_attack: # generate adversarial attacks - #logging.info('generating adv attack for batch=%d' % (batch)) + # logging.info('generating adv attack for batch=%d' % (batch)) self.model.eval() data_adv = self.attack.generate(data, target) max_delta = torch.max(torch.abs(data_adv - data)).item() - #z = torch.abs(data_adv-data) > 100 - #logging.info('zz {} {}'.format(data[z], data_adv[z])) - #logging.info('adv attack max perturbation=%f' % (max_delta)) + # z = torch.abs(data_adv-data) > 100 + # logging.info('zz {} {}'.format(data[z], data_adv[z])) + # logging.info('adv attack max perturbation=%f' % (max_delta)) data = data_adv self.set_train_mode() @@ -171,18 +180,18 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics['loss'] = loss.item() * self.grad_acc_steps + batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -191,10 +200,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics = ODict() if swa_update_bn: - log_tag = 'train_' + log_tag = "train_" self.set_train_mode() else: - log_tag = 'val_' + log_tag = "val_" self.model.eval() for batch, (data, target) in enumerate(data_loader): @@ -214,7 +223,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): output = self.model(feats) loss = self.loss(output, target) - batch_metrics['loss'] = loss.mean().item() + batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) @@ -227,7 +236,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): @staticmethod def filter_args(**kwargs): args = XVectorTrainerFromWav.filter_args(**kwargs) - valid_args = ('p_attack', 'p_val_attack') + valid_args = ("p_attack", "p_val_attack") args_1 = dict((k, kwargs[k]) for k in valid_args if k in kwargs) args.update(args_1) return args @@ -236,20 +245,22 @@ def filter_args(**kwargs): def add_class_args(parser, prefix=None, skip=[]): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") XVectorTrainerFromWav.add_class_args(parser, skip=skip) - parser.add_argument('--p-attack', - default=0.5, - type=float, - help='ratio of batches with adv attack') parser.add_argument( - '--p-val-attack', - default=0., + "--p-attack", + default=0.5, + type=float, + help="ratio of batches with adv attack", + ) + parser.add_argument( + "--p-val-attack", + default=0.0, type=float, - help='ratio of batches with adv attack in validation') + help="ratio of batches with adv attack in validation", + ) if prefix is not None: - outer_parser.add_argument('--' + prefix, - 
action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='trainer options') diff --git a/hyperion/torch/trainers/xvector_finetuner.py b/hyperion/torch/trainers/xvector_finetuner.py index 7df53796..cf833257 100644 --- a/hyperion/torch/trainers/xvector_finetuner.py +++ b/hyperion/torch/trainers/xvector_finetuner.py @@ -16,52 +16,72 @@ class XVectorFinetuner(XVectorTrainer): - - def __init__(self, model, optimizer, epochs, exp_path, cur_epoch=0, - grad_acc_steps=1, - device=None, metrics=None, lr_scheduler=None, loggers=None, - data_parallel=False, loss=None, finetune_mode='ft-embed-affine'): + def __init__( + self, + model, + optimizer, + epochs, + exp_path, + cur_epoch=0, + grad_acc_steps=1, + device=None, + metrics=None, + lr_scheduler=None, + loggers=None, + data_parallel=False, + loss=None, + finetune_mode="ft-embed-affine", + ): super(XVectorFinetuner, self).__init__( - model, optimizer, epochs, exp_path, cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, device=device, metrics=metrics, - lr_scheduler=lr_scheduler, loggers=loggers, data_parallel=data_parallel, loss=loss) + model, + optimizer, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lr_scheduler=lr_scheduler, + loggers=loggers, + data_parallel=data_parallel, + loss=loss, + ) self.finetune_mode = finetune_mode - def train_epoch(self, data_loader): - #epoch_batches = len(data_loader.dataset) - #total_batches = self.cur_epoch * epoch_batches - + # epoch_batches = len(data_loader.dataset) + # total_batches = self.cur_epoch * epoch_batches + self.model.update_loss_margin(self.cur_epoch) metric_acc = MetricAcc() batch_metrics = ODict() - #self.model.train_mode(self.finetune_mode) + # self.model.train_mode(self.finetune_mode) self.model.eval() for batch, (data, target) in enumerate(data_loader): self.loggers.on_batch_begin(batch) - + if batch % self.grad_acc_steps == 0: self.optimizer.zero_grad() - + data, target = data.to(self.device), target.to(self.device) batch_size = data.shape[0] output = self.model(data, target) - loss = self.loss(output, target).mean()/self.grad_acc_steps + loss = self.loss(output, target).mean() / self.grad_acc_steps loss.backward() - if (batch+1) % self.grad_acc_steps == 0: + if (batch + 1) % self.grad_acc_steps == 0: if self.lr_scheduler is not None: self.lr_scheduler.on_opt_step() self.optimizer.step() - batch_metrics['loss'] = loss.item() * self.grad_acc_steps + batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) - + # logging.info('batch={} shape={} loss={} acc={}'.format(batch,data.shape, batch_metrics['loss'], batch_metrics['acc'])) # if batch > 63: @@ -85,17 +105,13 @@ def train_epoch(self, data_loader): # #logging.info(str(torch.sum(torch.isnan(data)))) # #logging.info(str(torch.sum(torch.isnan(target)))) # #logging.info(str(torch.sum(torch.isnan(output)))) - + metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - #total_batches +=1 + # total_batches +=1 logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() return logs - - - - diff --git a/hyperion/torch/trainers/xvector_trainer.py b/hyperion/torch/trainers/xvector_trainer.py index e57471d5..190b2a30 100644 --- a/hyperion/torch/trainers/xvector_trainer.py +++ 
b/hyperion/torch/trainers/xvector_trainer.py @@ -17,33 +17,33 @@ class XVectorTrainer(TorchTrainer): """Trainer to train x-vector style models. - Attributes: - model: x-Vector model object. - optim: pytorch optimizer object or options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object or options dict - loggers: LoggerList object, loggers write training progress to std. output and file. - If None, it uses default loggers. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - loss: if None, it uses cross-entropy - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: x-Vector model object. + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict + loggers: LoggerList object, loggers write training progress to std. output and file. + If None, it uses default loggers. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ def __init__( @@ -108,8 +108,8 @@ def __init__( def train_epoch(self, data_loader): """Training epoch loop - Args: - data_loader: pytorch data loader returning features and class labels. + Args: + data_loader: pytorch data loader returning features and class labels. 
""" self.model.update_loss_margin(self.cur_epoch) diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py index 31180f9c..7b7cb21c 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg.py @@ -10,7 +10,7 @@ import torch import torch.nn as nn -from ..utils import MetricAcc #, TorchDataParallel +from ..utils import MetricAcc # , TorchDataParallel from .xvector_trainer import XVectorTrainer # class DFRModelWrapper(nn.Module): @@ -38,102 +38,107 @@ class XVectorTrainerDeepFeatReg(XVectorTrainer): """Trainer to train x-vector style models. - Attributes: - model: x-Vector model object that we want to fine-tune - prior_model: x-Vector model object that we use as regularizer - optim: pytorch optimizer object or options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - reg_layers_enc: list of encoder layer indexes that we use for regularization - reg_layers_classif: list of classification head layer indexes that we use for regularization - reg_weight_enc: weight of the regularization loss for encoder hidden activations - reg_weight_classif: weight of the regularization loss for classification head hidden activations - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object or options dict. - loggers: LoggerList object, loggers write training progress to std. output and file. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - loss: if None, it uses cross-entropy - reg_loss: nn.Module loss used for regularization, if None it uses L1 loss. - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: x-Vector model object that we want to fine-tune + prior_model: x-Vector model object that we use as regularizer + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + reg_layers_enc: list of encoder layer indexes that we use for regularization + reg_layers_classif: list of classification head layer indexes that we use for regularization + reg_weight_enc: weight of the regularization loss for encoder hidden activations + reg_weight_classif: weight of the regularization loss for classification head hidden activations + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict. + loggers: LoggerList object, loggers write training progress to std. output and file. 
+ ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + reg_loss: nn.Module loss used for regularization, if None it uses L1 loss. + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - prior_model, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - reg_layers_enc=None, - reg_layers_classif=None, - reg_weight_enc=0.1, - reg_weight_classif=0.1, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - loss=None, - reg_loss=None, - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): - - super().__init__(model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + + def __init__( + self, + model, + prior_model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + reg_layers_enc=None, + reg_layers_classif=None, + reg_weight_enc=0.1, + reg_weight_classif=0.1, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + reg_loss=None, + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + super().__init__( + model, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + loss=loss, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) self.prior_model = prior_model - if reg_loss is None or reg_loss == 'l1': + if reg_loss is None or reg_loss == "l1": reg_loss = nn.L1Loss() - elif reg_loss == 'mse': + elif reg_loss == "mse": reg_loss = nn.MSELoss() self.reg_loss = reg_loss self.reg_layers_enc = reg_layers_enc @@ -182,26 +187,35 @@ def train_epoch(self, data_loader): # h_enc, h_classif, output = self.model_wrapper( # data, target, self.reg_layers_enc, self.reg_layers_classif, # return_output=True, 
**self.amp_args) - outputs = self.model(data, - target, - self.reg_layers_enc, - self.reg_layers_classif, - return_output=True) - h_enc, h_classif, output = (outputs['h_enc'], - outputs['h_classif'], - outputs['output']) - - loss = self.loss(output, target).mean( - ) # you need to take the mean here because of the multi-gpu training - batch_metrics['loss-classif'] = loss.item() - - prior_outputs = self.prior_model(data, - target, - self.reg_layers_enc, - self.reg_layers_classif, - return_output=False) - prior_h_enc, prior_h_classif = (prior_outputs['h_enc'], - prior_outputs['h_classif']) + outputs = self.model( + data, + target, + self.reg_layers_enc, + self.reg_layers_classif, + return_output=True, + ) + h_enc, h_classif, output = ( + outputs["h_enc"], + outputs["h_classif"], + outputs["output"], + ) + + loss = self.loss( + output, target + ).mean() # you need to take the mean here because of the multi-gpu training + batch_metrics["loss-classif"] = loss.item() + + prior_outputs = self.prior_model( + data, + target, + self.reg_layers_enc, + self.reg_layers_classif, + return_output=False, + ) + prior_h_enc, prior_h_classif = ( + prior_outputs["h_enc"], + prior_outputs["h_classif"], + ) n_enc = len(h_enc) if n_enc > 0: @@ -209,7 +223,7 @@ def train_epoch(self, data_loader): for i in range(n_enc): l = self.reg_layers_enc[i] loss_i = self.reg_loss(h_enc[i], prior_h_enc[i]).mean() - loss_name = 'reg-h-enc-%d' % l + loss_name = "reg-h-enc-%d" % l batch_metrics[loss_name] = loss_i.item() loss += loss_scale * loss_i @@ -218,13 +232,12 @@ def train_epoch(self, data_loader): loss_scale = self.reg_weight_classif / n_classif for i in range(n_classif): l = self.reg_layers_classif[i] - loss_i = self.reg_loss(h_classif[i], - prior_h_classif[i]).mean() - loss_name = 'reg-h-classif-%d' % l + loss_i = self.reg_loss(h_classif[i], prior_h_classif[i]).mean() + loss_name = "reg-h-classif-%d" % l batch_metrics[loss_name] = loss_i.item() loss += loss_scale * loss_i - batch_metrics['loss'] = loss.item() + batch_metrics["loss"] = loss.item() loss = loss / self.grad_acc_steps if self.use_amp: @@ -242,20 +255,25 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) - #total_batches +=1 + # total_batches +=1 logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() return logs @staticmethod def filter_args(**kwargs): args = XVectorTrainer.filter_args(**kwargs) - valid_args = ('reg_layers_enc', 'reg_layers_classif', 'reg_weight_enc', - 'reg_weight_classif', 'reg_loss') + valid_args = ( + "reg_layers_enc", + "reg_layers_classif", + "reg_weight_enc", + "reg_weight_classif", + "reg_loss", + ) args_1 = dict((k, kwargs[k]) for k in valid_args if k in kwargs) args.update(args_1) return args @@ -264,37 +282,41 @@ def filter_args(**kwargs): def add_class_args(parser, prefix=None, skip=[]): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') + parser = ArgumentParser(prog="") XVectorTrainer.add_class_args(parser, skip=skip) parser.add_argument( - '--reg-layers-enc', + "--reg-layers-enc", type=int, default=None, - nargs='+', - help= - 'list of layers from the encoder nnet to use for regularization ') + nargs="+", + help="list of layers from the encoder nnet to use for regularization ", 
+ ) parser.add_argument( - '--reg-layers-classif', + "--reg-layers-classif", type=int, default=None, - nargs='+', - help= - 'list of layers from the classif nnet to use for regularization ') - parser.add_argument('--reg-weight-enc', - type=float, - default=0.1, - help='weight for regularization from enc layers') + nargs="+", + help="list of layers from the classif nnet to use for regularization ", + ) parser.add_argument( - '--reg-weight-classif', + "--reg-weight-enc", type=float, default=0.1, - help='weight for regularization from classif layers') - parser.add_argument('--reg-loss', - default='l1', - choices=['l1', 'mse'], - help=('type of regularization loss')) + help="weight for regularization from enc layers", + ) + parser.add_argument( + "--reg-weight-classif", + type=float, + default=0.1, + help="weight for regularization from classif layers", + ) + parser.add_argument( + "--reg-loss", + default="l1", + choices=["l1", "mse"], + help=("type of regularization loss"), + ) if prefix is not None: - outer_parser.add_argument('--' + prefix, - action=ActionParser(parser=parser)) + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) # help='trainer options') diff --git a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py index 80569a26..29964322 100644 --- a/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_deep_feat_reg_from_wav.py @@ -10,7 +10,7 @@ import torch import torch.nn as nn -from ..utils import MetricAcc #, TorchDataParallel +from ..utils import MetricAcc # , TorchDataParallel from .torch_trainer import TorchTrainer from .xvector_trainer_deep_feat_reg import XVectorTrainerDeepFeatReg @@ -18,104 +18,109 @@ class XVectorTrainerDeepFeatRegFromWav(XVectorTrainerDeepFeatReg): """Trainer to train x-vector style models. - Attributes: - model: x-Vector model object that we want to fine-tune - feat_extractor: feature extractor nn.Module - prior_model: x-Vector model object that we use as regularizer - optim: pytorch optimizer object or options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - reg_layers_enc: list of encoder layer indexes that we use for regularization - reg_layers_classif: list of classification head layer indexes that we use for regularization - reg_weight_enc: weight of the regularization loss for encoder hidden activations - reg_weight_classif: weight of the regularization loss for classification head hidden activations - device: cpu/gpu device - lrsched: learning rate scheduler object or options dict. - loggers: LoggerList object, loggers write training progress to std. output and file. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - loss: if None, it uses cross-entropy - reg_loss: nn.Module loss used for regularization, if None it uses L1 loss. - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. 
steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: x-Vector model object that we want to fine-tune + feat_extractor: feature extractor nn.Module + prior_model: x-Vector model object that we use as regularizer + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + reg_layers_enc: list of encoder layer indexes that we use for regularization + reg_layers_classif: list of classification head layer indexes that we use for regularization + reg_weight_enc: weight of the regularization loss for encoder hidden activations + reg_weight_classif: weight of the regularization loss for classification head hidden activations + device: cpu/gpu device + lrsched: learning rate scheduler object or options dict. + loggers: LoggerList object, loggers write training progress to std. output and file. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + reg_loss: nn.Module loss used for regularization, if None it uses L1 loss. + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - feat_extractor, - prior_model, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - reg_layers_enc=None, - reg_layers_classif=None, - reg_weight_enc=0.1, - reg_weight_classif=0.1, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - loss=None, - reg_loss=None, - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): - - super().__init__(model, - prior_model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - reg_layers_enc=reg_layers_enc, - reg_layers_classif=reg_layers_classif, - reg_weight_enc=reg_weight_enc, - reg_weight_classif=reg_weight_classif, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - reg_loss=reg_loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + + def __init__( + self, + model, + 
feat_extractor, + prior_model, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + reg_layers_enc=None, + reg_layers_classif=None, + reg_weight_enc=0.1, + reg_weight_classif=0.1, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + reg_loss=None, + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + super().__init__( + model, + prior_model, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + reg_layers_enc=reg_layers_enc, + reg_layers_classif=reg_layers_classif, + reg_weight_enc=reg_weight_enc, + reg_weight_classif=reg_weight_classif, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + loss=loss, + reg_loss=reg_loss, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) self.feat_extractor = feat_extractor if device is not None: @@ -152,29 +157,38 @@ def train_epoch(self, data_loader): # h_enc, h_classif, output = self.model_wrapper( # feats, target, self.reg_layers_enc, self.reg_layers_classif, # return_output=True, **self.amp_args) - outputs = self.model(feats, - target, - self.reg_layers_enc, - self.reg_layers_classif, - return_output=True) - h_enc, h_classif, output = (outputs['h_enc'], - outputs['h_classif'], - outputs['output']) - - loss = self.loss(output, target).mean( - ) # you need to take the mean here because of the multi-gpu training - batch_metrics['loss-classif'] = loss.item() + outputs = self.model( + feats, + target, + self.reg_layers_enc, + self.reg_layers_classif, + return_output=True, + ) + h_enc, h_classif, output = ( + outputs["h_enc"], + outputs["h_classif"], + outputs["output"], + ) + + loss = self.loss( + output, target + ).mean() # you need to take the mean here because of the multi-gpu training + batch_metrics["loss-classif"] = loss.item() # prior_h_enc, prior_h_classif = self.prior_model_wrapper( # feats, target, self.reg_layers_enc, self.reg_layers_classif, # return_output=False, **self.amp_args) - prior_outputs = self.prior_model(feats, - target, - self.reg_layers_enc, - self.reg_layers_classif, - return_output=False) - prior_h_enc, prior_h_classif = (prior_outputs['h_enc'], - prior_outputs['h_classif']) + prior_outputs = self.prior_model( + feats, + target, + self.reg_layers_enc, + self.reg_layers_classif, + return_output=False, + ) + prior_h_enc, prior_h_classif = ( + prior_outputs["h_enc"], + prior_outputs["h_classif"], + ) n_enc = len(h_enc) if n_enc > 0: @@ -182,7 +196,7 @@ def train_epoch(self, data_loader): for i in range(n_enc): l = self.reg_layers_enc[i] loss_i = self.reg_loss(h_enc[i], prior_h_enc[i]).mean() - loss_name = 'reg-h-enc-%d' % l + loss_name = "reg-h-enc-%d" % l batch_metrics[loss_name] = loss_i.item() loss += loss_scale * loss_i @@ -191,13 +205,12 @@ def train_epoch(self, data_loader): loss_scale = self.reg_weight_classif / n_classif for i in range(n_classif): l = self.reg_layers_classif[i] - loss_i = self.reg_loss(h_classif[i], - prior_h_classif[i]).mean() - loss_name = 'reg-h-classif-%d' % l + loss_i = self.reg_loss(h_classif[i], 
prior_h_classif[i]).mean() + loss_name = "reg-h-classif-%d" % l batch_metrics[loss_name] = loss_i.item() loss += loss_scale * loss_i - batch_metrics['loss'] = loss.item() + batch_metrics["loss"] = loss.item() loss = loss / self.grad_acc_steps if self.use_amp: @@ -215,12 +228,12 @@ def train_epoch(self, data_loader): metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -233,10 +246,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: - log_tag = 'train_' + log_tag = "train_" self.set_train_mode() else: - log_tag = 'val_' + log_tag = "val_" self.model.eval() for batch, (data, target) in enumerate(data_loader): @@ -248,7 +261,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): output = self.model(feats) loss = self.loss(output, target) - batch_metrics['loss'] = loss.mean().item() + batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) diff --git a/hyperion/torch/trainers/xvector_trainer_from_wav.py b/hyperion/torch/trainers/xvector_trainer_from_wav.py index b2e77c72..06086d32 100644 --- a/hyperion/torch/trainers/xvector_trainer_from_wav.py +++ b/hyperion/torch/trainers/xvector_trainer_from_wav.py @@ -17,87 +17,92 @@ class XVectorTrainerFromWav(XVectorTrainer): """Trainer to train x-vector style models. - Attributes: - model: x-Vector model object. - feat_extractor: feature extractor nn.Module - optim: pytorch optimizer object or options dict - epochs: max. number of epochs - exp_path: experiment output path - cur_epoch: current epoch - grad_acc_steps: gradient accumulation steps to simulate larger batch size. - device: cpu/gpu device - metrics: extra metrics to compute besides cxe. - lrsched: learning rate scheduler object or options dict. - loggers: LoggerList object, loggers write training progress to std. output and file. - ddp: if True use distributed data parallel training - ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) - loss: if None, it uses cross-entropy - train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] - use_amp: uses mixed precision training. - log_interval: number of optim. steps between log outputs - use_tensorboard: use tensorboard logger - use_wandb: use wandb logger - wandb: wandb dictionary of options - grad_clip: norm to clip gradients, if 0 there is no clipping - grad_clip_norm: norm type to clip gradients - swa_start: epoch to start doing swa - swa_lr: SWA learning rate - swa_anneal_epochs: SWA learning rate anneal epochs - cpu_offload: CPU offload of gradients when using fully sharded ddp + Attributes: + model: x-Vector model object. + feat_extractor: feature extractor nn.Module + optim: pytorch optimizer object or options dict + epochs: max. number of epochs + exp_path: experiment output path + cur_epoch: current epoch + grad_acc_steps: gradient accumulation steps to simulate larger batch size. + device: cpu/gpu device + metrics: extra metrics to compute besides cxe. + lrsched: learning rate scheduler object or options dict. 
+ loggers: LoggerList object, loggers write training progress to std. output and file. + ddp: if True use distributed data parallel training + ddp_type: type of distributed data parallel in (ddp, oss_ddp, oss_shared_ddp) + loss: if None, it uses cross-entropy + train_mode: training mode in ['train', 'ft-full', 'ft-last-layer'] + use_amp: uses mixed precision training. + log_interval: number of optim. steps between log outputs + use_tensorboard: use tensorboard logger + use_wandb: use wandb logger + wandb: wandb dictionary of options + grad_clip: norm to clip gradients, if 0 there is no clipping + grad_clip_norm: norm type to clip gradients + swa_start: epoch to start doing swa + swa_lr: SWA learning rate + swa_anneal_epochs: SWA learning rate anneal epochs + cpu_offload: CPU offload of gradients when using fully sharded ddp """ - def __init__(self, - model, - feat_extractor, - optim={}, - epochs=100, - exp_path='./train', - cur_epoch=0, - grad_acc_steps=1, - device=None, - metrics=None, - lrsched=None, - loggers=None, - ddp=False, - ddp_type='ddp', - loss=None, - train_mode='train', - use_amp=False, - log_interval=10, - use_tensorboard=False, - use_wandb=False, - wandb={}, - grad_clip=0, - grad_clip_norm=2, - swa_start=0, - swa_lr=1e-3, - swa_anneal_epochs=10, - cpu_offload=False): - - super().__init__(model, - optim, - epochs, - exp_path, - cur_epoch=cur_epoch, - grad_acc_steps=grad_acc_steps, - device=device, - metrics=metrics, - lrsched=lrsched, - loggers=loggers, - ddp=ddp, - ddp_type=ddp_type, - loss=loss, - train_mode=train_mode, - use_amp=use_amp, - log_interval=log_interval, - use_tensorboard=use_tensorboard, - use_wandb=use_wandb, - wandb=wandb, - grad_clip=grad_clip, - grad_clip_norm=grad_clip_norm, - swa_start=swa_start, - swa_lr=swa_lr, - swa_anneal_epochs=swa_anneal_epochs, - cpu_offload=cpu_offload) + + def __init__( + self, + model, + feat_extractor, + optim={}, + epochs=100, + exp_path="./train", + cur_epoch=0, + grad_acc_steps=1, + device=None, + metrics=None, + lrsched=None, + loggers=None, + ddp=False, + ddp_type="ddp", + loss=None, + train_mode="train", + use_amp=False, + log_interval=10, + use_tensorboard=False, + use_wandb=False, + wandb={}, + grad_clip=0, + grad_clip_norm=2, + swa_start=0, + swa_lr=1e-3, + swa_anneal_epochs=10, + cpu_offload=False, + ): + + super().__init__( + model, + optim, + epochs, + exp_path, + cur_epoch=cur_epoch, + grad_acc_steps=grad_acc_steps, + device=device, + metrics=metrics, + lrsched=lrsched, + loggers=loggers, + ddp=ddp, + ddp_type=ddp_type, + loss=loss, + train_mode=train_mode, + use_amp=use_amp, + log_interval=log_interval, + use_tensorboard=use_tensorboard, + use_wandb=use_wandb, + wandb=wandb, + grad_clip=grad_clip, + grad_clip_norm=grad_clip_norm, + swa_start=swa_start, + swa_lr=swa_lr, + swa_anneal_epochs=swa_anneal_epochs, + cpu_offload=cpu_offload, + ) self.feat_extractor = feat_extractor if device is not None: @@ -109,8 +114,8 @@ def __init__(self, def train_epoch(self, data_loader): """Training epoch loop - Args: - data_loader: pytorch data loader returning features and class labels. + Args: + data_loader: pytorch data loader returning features and class labels. 
""" self.model.update_loss_margin(self.cur_epoch) @@ -143,18 +148,18 @@ def train_epoch(self, data_loader): self.lr_scheduler.on_opt_step() self.update_model() - batch_metrics['loss'] = loss.item() * self.grad_acc_steps + batch_metrics["loss"] = loss.item() * self.grad_acc_steps for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) metric_acc.update(batch_metrics, batch_size) logs = metric_acc.metrics - logs['lr'] = self._get_lr() + logs["lr"] = self._get_lr() self.loggers.on_batch_end(logs=logs, batch_size=batch_size) logs = metric_acc.metrics - logs = ODict(('train_' + k, v) for k, v in logs.items()) - logs['lr'] = self._get_lr() + logs = ODict(("train_" + k, v) for k, v in logs.items()) + logs["lr"] = self._get_lr() return logs def validation_epoch(self, data_loader, swa_update_bn=False): @@ -167,10 +172,10 @@ def validation_epoch(self, data_loader, swa_update_bn=False): batch_metrics = ODict() with torch.no_grad(): if swa_update_bn: - log_tag = 'train_' + log_tag = "train_" self.set_train_mode() else: - log_tag = 'val_' + log_tag = "val_" self.model.eval() for batch, (data, target) in enumerate(data_loader): @@ -182,7 +187,7 @@ def validation_epoch(self, data_loader, swa_update_bn=False): output = self.model(feats, **self.amp_args) loss = self.loss(output, target) - batch_metrics['loss'] = loss.mean().item() + batch_metrics["loss"] = loss.mean().item() for k, metric in self.metrics.items(): batch_metrics[k] = metric(output, target) diff --git a/hyperion/torch/transforms/__init__.py b/hyperion/torch/transforms/__init__.py index 92ea44ed..a7432bab 100644 --- a/hyperion/torch/transforms/__init__.py +++ b/hyperion/torch/transforms/__init__.py @@ -5,4 +5,3 @@ from .reshape import Reshape - diff --git a/hyperion/torch/transforms/reshape.py b/hyperion/torch/transforms/reshape.py index 8ffa7655..c7a03e74 100644 --- a/hyperion/torch/transforms/reshape.py +++ b/hyperion/torch/transforms/reshape.py @@ -5,12 +5,10 @@ import torch -class Reshape(object): +class Reshape(object): def __init__(self, shape): self.shape = shape def __call__(self, x): return torch.reshape(x, shape=self.shape) - - diff --git a/hyperion/torch/utils/__init__.py b/hyperion/torch/utils/__init__.py index 0e68827d..6db39ef3 100644 --- a/hyperion/torch/utils/__init__.py +++ b/hyperion/torch/utils/__init__.py @@ -8,6 +8,3 @@ from .eval_utils import eval_nnet_by_chunks, eval_nnet_overlap_add from .data_parallel import TorchDataParallel from .ddp import TorchDDP, FairShardedDDP, FairFullyShardedDDP - - - diff --git a/hyperion/torch/utils/data_parallel.py b/hyperion/torch/utils/data_parallel.py index 8b1feccc..9ec9dba0 100644 --- a/hyperion/torch/utils/data_parallel.py +++ b/hyperion/torch/utils/data_parallel.py @@ -6,6 +6,7 @@ import torch import torch.nn as nn + class TorchDataParallel(nn.DataParallel): def __getattr__(self, name): try: diff --git a/hyperion/torch/utils/ddp.py b/hyperion/torch/utils/ddp.py index 9d62bf68..400704a5 100644 --- a/hyperion/torch/utils/ddp.py +++ b/hyperion/torch/utils/ddp.py @@ -16,26 +16,37 @@ def add_ddp_args(parser): - parser.add_argument('--num-gpus', type=int, default=1, - help='number of gpus, if 0 it uses cpu') - parser.add_argument('--node-id', type=int, default=0, - help='node id for distributed training') - parser.add_argument('--num-nodes', type=int, default=1, - help='number of nodes in which we distribute the training') - parser.add_argument('--master-addr', default='localhost', - help='address of the master node') - parser.add_argument('--master-port', 
default='1234', - help='port of the master node, if None it will be random') + parser.add_argument( + "--num-gpus", type=int, default=1, help="number of gpus, if 0 it uses cpu" + ) + parser.add_argument( + "--node-id", type=int, default=0, help="node id for distributed training" + ) + parser.add_argument( + "--num-nodes", + type=int, + default=1, + help="number of nodes in which we distribute the training", + ) + parser.add_argument( + "--master-addr", default="localhost", help="address of the master node" + ) + parser.add_argument( + "--master-port", + default="1234", + help="port of the master node, if None it will be random", + ) def filter_ddp_args(**kwargs): - valid_args = ('num_gpus', 'node_id', 'num_nodes', 'master_addr', 'master_port') - args = dict((k, kwargs[k]) - for k in valid_args if k in kwargs) + valid_args = ("num_gpus", "node_id", "num_nodes", "master_addr", "master_port") + args = dict((k, kwargs[k]) for k in valid_args if k in kwargs) return args -def ddp_init(gpu_id, num_gpus, node_id=0, num_nodes=1, master_addr='localhost', master_port=None): +def ddp_init( + gpu_id, num_gpus, node_id=0, num_nodes=1, master_addr="localhost", master_port=None +): rank = node_id * num_gpus + gpu_id world_size = num_nodes * num_gpus @@ -46,11 +57,13 @@ def ddp_init(gpu_id, num_gpus, node_id=0, num_nodes=1, master_addr='localhost', torch.cuda.set_device(gpu_id) torch.tensor([0]).to(gpu_id) - os.environ['MASTER_ADDR'] = master_addr - os.environ['MASTER_PORT'] = master_port + os.environ["MASTER_ADDR"] = master_addr + os.environ["MASTER_PORT"] = master_port - logging.info(f'init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port}') - dist.init_process_group('nccl', rank=rank, world_size=world_size) + logging.info( + f"init ddp rank={rank} world_size={world_size} master={master_addr}:{master_port}" + ) + dist.init_process_group("nccl", rank=rank, world_size=world_size) return gpu_id, rank, world_size @@ -83,5 +96,3 @@ def __getattr__(self, name): return super().__getattr__(name) except AttributeError: return getattr(self.module, name) - - diff --git a/hyperion/torch/utils/devices.py b/hyperion/torch/utils/devices.py index 6e5b75c5..16c61a48 100644 --- a/hyperion/torch/utils/devices.py +++ b/hyperion/torch/utils/devices.py @@ -8,36 +8,37 @@ import torch + def open_device(num_gpus=1, gpu_ids=None, find_free_gpu=False): if find_free_gpu: - os.environ['CUDA_DEVICE_ORDER']='PCI_BUS_ID' + os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" if gpu_ids is None: gpu_ids = find_free_gpus(num_gpus) if isinstance(gpu_ids, list): - gpu_ids = ','.join([str(i) for i in gpu_ids]) - - os.environ['CUDA_VISIBLE_DEVICES'] = gpu_ids - - if num_gpus > 0 and torch.cuda.is_available(): - logging.info('CUDA_VISIBLE_DEVICES=%s' % os.environ['CUDA_VISIBLE_DEVICES']) - logging.info('init gpu device') - device = torch.device('cuda', 0) + gpu_ids = ",".join([str(i) for i in gpu_ids]) + + os.environ["CUDA_VISIBLE_DEVICES"] = gpu_ids + + if num_gpus > 0 and torch.cuda.is_available(): + logging.info("CUDA_VISIBLE_DEVICES=%s" % os.environ["CUDA_VISIBLE_DEVICES"]) + logging.info("init gpu device") + device = torch.device("cuda", 0) torch.tensor([0]).to(device) # reserve the rest of gpus for n in range(1, num_gpus): - device_n = torch.device('cuda', n) - #torch.tensor([0]).to(device_n) + device_n = torch.device("cuda", n) + # torch.tensor([0]).to(device_n) else: - logging.info('init cpu device') - device = torch.device('cpu') + logging.info("init cpu device") + device = torch.device("cpu") return device def 
find_free_gpus(num_gpus): try: - result = subprocess.run('free-gpu', stdout=subprocess.PIPE) - gpu_ids = result.stdout.decode('utf-8') + result = subprocess.run("free-gpu", stdout=subprocess.PIPE) + gpu_ids = result.stdout.decode("utf-8") except: - gpu_ids = '0' + gpu_ids = "0" return gpu_ids diff --git a/hyperion/torch/utils/distributions.py b/hyperion/torch/utils/distributions.py index db6f8d85..4793fa69 100644 --- a/hyperion/torch/utils/distributions.py +++ b/hyperion/torch/utils/distributions.py @@ -7,6 +7,7 @@ import torch import torch.distributions as dists + def squeeze_pdf(pdf, dim): if isinstance(pdf, dists.normal.Normal): @@ -22,39 +23,32 @@ def squeeze_pdf_(pdf, dim): pdf.scale.squeeze_(dim=dim) - def serialize_pdf_to_dict(pdf): """Serializes pdfs to a dictionary - - When we want to return a pdf in a forward function, + + When we want to return a pdf in a forward function, and we are using DataParallel, we need to transform the pdf into a - dictionary of tensors because DataParallel only is able to combine + dictionary of tensors because DataParallel only is able to combine tensors from multiple GPUs but not other objects like distributions. """ if isinstance(pdf, dists.normal.Normal): - return {'normal.loc': pdf.loc, 'normal.scale': pdf.scale } + return {"normal.loc": pdf.loc, "normal.scale": pdf.scale} else: raise NotImplementedError() - def deserialize_pdf_from_dict(pdf): """Derializes pdfs from a dictionary - - When we want to return a pdf in a forward function, + + When we want to return a pdf in a forward function, and we are using DataParallel, we need to transform the pdf into a - dictionary of tensors because DataParallel only is able to combine + dictionary of tensors because DataParallel only is able to combine tensors from multiple GPUs but not other objects like distributions. 
This function will transform the dictionary back into torch.distribution objects """ - pdf_type = re.sub(r'.*','', pdf.keys()[0]) - if pdf_type == 'normal': - return dists.normal.Normal( - loc=pdf['normal.loc'], scale=pdf['normal.scale']) + pdf_type = re.sub(r".*", "", pdf.keys()[0]) + if pdf_type == "normal": + return dists.normal.Normal(loc=pdf["normal.loc"], scale=pdf["normal.scale"]) else: raise NotImplementedError() - - - - diff --git a/hyperion/torch/utils/eval_utils.py b/hyperion/torch/utils/eval_utils.py index 16a3e9b0..e8fa9c86 100644 --- a/hyperion/torch/utils/eval_utils.py +++ b/hyperion/torch/utils/eval_utils.py @@ -6,13 +6,14 @@ import math import torch + def eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1): # model_device = next(nnet.parameters()).device # print(device, model_device, x.device) - #assume time is the last dimension + # assume time is the last dimension device = None if nnet.device == x.device else nnet.device - + T = x.shape[time_dim] if T <= chunk_length or chunk_length == 0: if device is not None: @@ -34,21 +35,20 @@ def eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1 try: out_shape = nnet.out_shape(in_shape) T_out = out_shape[time_dim] - r = float(T_out)/T + r = float(T_out) / T except: out_shape = None - - num_chunks = int(math.ceil((T-chunk_length)/chunk_shift_in+1)) - #move time dimension to dim 0 - x = x.transpose(0, time_dim) + num_chunks = int(math.ceil((T - chunk_length) / chunk_shift_in + 1)) + # move time dimension to dim 0 + x = x.transpose(0, time_dim) y = None tbeg_in = 0 tbeg_out = 0 for i in range(num_chunks): tend_in = min(tbeg_in + chunk_length, x.shape[0]) - #get slice and move back time dimension to last dim - x_i = x[tbeg_in:tend_in].transpose(0, time_dim) + # get slice and move back time dimension to last dim + x_i = x[tbeg_in:tend_in].transpose(0, time_dim) if device is not None: x_i = x_i.to(device) @@ -59,7 +59,7 @@ def eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1 chunk_length_out = y_i.shape[time_dim] if out_shape is None: # infer chunk_shift in the output - r = float(chunk_length_out)/chunk_length + r = float(chunk_length_out) / chunk_length # infer total output length T_out = int(r * T) @@ -67,26 +67,28 @@ def eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1 out_shape[time_dim] = T_out if y is None: - right_context_out = int(math.floor(r*right_context)) - left_context_out = int(math.floor(r*left_context)) + right_context_out = int(math.floor(r * right_context)) + left_context_out = int(math.floor(r * left_context)) chunk_shift_out = chunk_length_out - right_context_out - left_context_out # create output tensor y = torch.zeros(out_shape) - #move time dimension to dim 0 + # move time dimension to dim 0 y = y.transpose(0, time_dim) - + y_i = y_i.transpose(0, time_dim) if i == 0: tend_out = min(tbeg_out + chunk_length_out, T_out) y[tbeg_out:tend_out] = y_i - tbeg_out =+ (chunk_length_out - right_context_out) + tbeg_out = +(chunk_length_out - right_context_out) else: - tend_out = min(int(round(tbeg_out)) + chunk_length_out - left_context_out, T_out) + tend_out = min( + int(round(tbeg_out)) + chunk_length_out - left_context_out, T_out + ) dt = tend_out - tbeg_out if dt > 0: - #print('eu', tbeg_out, tend_out, left_context_out,left_context_out+dt, T_out, chunk_length, chunk_length_out, tbeg_in, tend_in) - y[tbeg_out:tend_out] = y_i[left_context_out:left_context_out+dt] + # print('eu', tbeg_out, tend_out, 
left_context_out,left_context_out+dt, T_out, chunk_length, chunk_length_out, tbeg_in, tend_in) + y[tbeg_out:tend_out] = y_i[left_context_out : left_context_out + dt] tbeg_out += chunk_shift_out tbeg_in += chunk_shift_in @@ -95,14 +97,15 @@ def eval_nnet_by_chunks(x, nnet, chunk_length=0, detach_chunks=True, time_dim=-1 y = y.transpose(0, time_dim) return y - -def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_chunks=True, time_dim=-1): +def eval_nnet_overlap_add( + x, nnet, chunk_length=0, chunk_overlap=None, detach_chunks=True, time_dim=-1 +): device = None if nnet.device == x.device else nnet.device - #assume time is the last dimension + # assume time is the last dimension T = x.shape[time_dim] if T <= chunk_length or chunk_length == 0: if device is not None: @@ -113,7 +116,7 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch return y if chunk_overlap is None: - #infer chunk overlap from network input context + # infer chunk overlap from network input context try: left_context, right_context = nnet.in_context() except: @@ -121,29 +124,27 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch chunk_overlap = left_context + right_context - in_shape = x.shape chunk_shift_in = chunk_length - chunk_overlap try: out_shape = nnet.out_shape(in_shape) T_out = out_shape[time_dim] - r = float(T_out)/T + r = float(T_out) / T except: out_shape = None - - num_chunks = int(math.ceil((T-chunk_length)/chunk_shift_in+1)) - #move time dimension to dim 0 - x = x.transpose(0, time_dim) + num_chunks = int(math.ceil((T - chunk_length) / chunk_shift_in + 1)) + # move time dimension to dim 0 + x = x.transpose(0, time_dim) y = None N = None tbeg_in = 0 tbeg_out = 0 for i in range(num_chunks): tend_in = min(tbeg_in + chunk_length, x.shape[0]) - #get slice and move back time dimension to last dim - x_i = x[tbeg_in:tend_in].transpose(0, time_dim) + # get slice and move back time dimension to last dim + x_i = x[tbeg_in:tend_in].transpose(0, time_dim) if device is not None: x_i = x_i.to(device) @@ -154,7 +155,7 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch chunk_length_out = y_i.shape[time_dim] if out_shape is None: # infer chunk_shift in the output - r = float(chunk_length_out)/chunk_length + r = float(chunk_length_out) / chunk_length # infer total output length T_out = int(r * T) @@ -162,13 +163,13 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch out_shape[time_dim] = T_out if y is None: - chunk_shift_out = r*chunk_shift_in + chunk_shift_out = r * chunk_shift_in # create output tensor y = torch.zeros(out_shape) - #move time dimension to dim 0 + # move time dimension to dim 0 y = y.transpose(0, time_dim) count = torch.zeros(T_out) - + y_i = y_i.transpose(0, time_dim) tend_out = min(int(round(tbeg_out)) + chunk_length_out, T_out) @@ -179,10 +180,11 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch tbeg_in += chunk_shift_in # put time dimension back in his place and normalize - y = y.transpose(0, time_dim)/count + y = y.transpose(0, time_dim) / count return y - + + # """ # Copyright 2019 Johns Hopkins University (Author: Jesus Villalba) # Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) @@ -215,18 +217,18 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch # r = float(T_out)/T # except: # out_shape = None - + # num_chunks = int(math.ceil((T-chunk_length)/chunk_shift_in+1)) # #move time 
dimension to dim 0 -# x = x.transpose(0, time_dim) +# x = x.transpose(0, time_dim) # y = None # tbeg_in = 0 # tbeg_out = 0 # for i in range(num_chunks): # tend_in = min(tbeg_in + chunk_length, x.shape[0]) # #get slice and move back time dimension to last dim -# x_i = x[tbeg_in:tend_in].transpose(0, time_dim) +# x_i = x[tbeg_in:tend_in].transpose(0, time_dim) # if device is not None: # x_i = x_i.to(device) @@ -249,7 +251,7 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch # y = torch.zeros(out_shape) # #move time dimension to dim 0 # y = y.transpose(0, time_dim) - + # y_i = y_i.transpose(0, time_dim) # if i == 0: @@ -270,7 +272,6 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch # y = y.transpose(0, time_dim) # return y - # def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, device=None, time_dim=-1): @@ -301,11 +302,11 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch # r = float(T_out)/T # except: # out_shape = None - + # num_chunks = int(math.ceil((T-chunk_length)/chunk_shift_in+1)) # #move time dimension to dim 0 -# x = x.transpose(0, time_dim) +# x = x.transpose(0, time_dim) # y = None # N = None # tbeg_in = 0 @@ -313,7 +314,7 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch # for i in range(num_chunks): # tend_in = min(tbeg_in + chunk_length, x.shape[0]) # #get slice and move back time dimension to last dim -# x_i = x[tbeg_in:tend_in].transpose(0, time_dim) +# x_i = x[tbeg_in:tend_in].transpose(0, time_dim) # if device is not None: # x_i = x_i.to(device) @@ -335,7 +336,7 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch # #move time dimension to dim 0 # y = y.transpose(0, time_dim) # count = torch.zeros(T_out) - + # y_i = y_i.transpose(0, time_dim) # tend_out = min(int(round(tbeg_out)) + chunk_length_out, T_out) @@ -349,4 +350,3 @@ def eval_nnet_overlap_add(x, nnet, chunk_length=0, chunk_overlap=None, detach_ch # y = y.transpose(0, time_dim)/count # return y - diff --git a/hyperion/torch/utils/math.py b/hyperion/torch/utils/math.py index b8122e49..82c5b2e9 100644 --- a/hyperion/torch/utils/math.py +++ b/hyperion/torch/utils/math.py @@ -5,7 +5,10 @@ import torch -def invert_trimat(A, lower=False, right_inv=False, return_logdet=False, return_inv=False): + +def invert_trimat( + A, lower=False, right_inv=False, return_logdet=False, return_inv=False +): """Inversion of triangular matrices. Returns lambda function f that multiplies the inverse of A times a vector. @@ -23,9 +26,9 @@ def invert_trimat(A, lower=False, right_inv=False, return_logdet=False, return_i """ if right_inv: - fh=lambda x: torch.triangular_solve(x.t(), A.t(), upper=lower)[0].t() + fh = lambda x: torch.triangular_solve(x.t(), A.t(), upper=lower)[0].t() else: - fh=lambda x: torch.triangular_solve(x, A, upper=not(lower))[0] + fh = lambda x: torch.triangular_solve(x, A, upper=not (lower))[0] if return_logdet or return_inv: r = [fh] diff --git a/hyperion/torch/utils/metric_acc.py b/hyperion/torch/utils/metric_acc.py index 73a007c2..d635310b 100644 --- a/hyperion/torch/utils/metric_acc.py +++ b/hyperion/torch/utils/metric_acc.py @@ -9,9 +9,10 @@ import torch import torch.distributed as dist + class MetricAcc(object): - """Class to accumulate metrics during an epoch. 
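For reference, the lambdas built by invert_trimat above only wrap a triangular solve, so f(b) returns A^{-1} b (or b A^{-1} when right_inv=True) without ever forming the inverse explicitly. A minimal sanity check, assuming a PyTorch version that still ships torch.triangular_solve as used in the hunk above:

import torch

A = torch.tril(torch.rand(4, 4)) + 4.0 * torch.eye(4)  # well-conditioned lower-triangular matrix (synthetic)
b = torch.rand(4, 2)

# mirrors invert_trimat(A, lower=True): f(b) should equal A^{-1} b
f = lambda x: torch.triangular_solve(x, A, upper=False)[0]
print(torch.allclose(f(b), torch.linalg.solve(A, b)))  # expected: True

# mirrors the right_inv=True branch: g(c) should equal c A^{-1}
c = torch.rand(2, 4)
g = lambda x: torch.triangular_solve(x.t(), A.t(), upper=True)[0].t()
print(torch.allclose(g(c), c @ torch.linalg.inv(A)))  # expected: True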
- """ + """Class to accumulate metrics during an epoch.""" + def __init__(self, device=None): self.keys = None self.acc = None @@ -26,43 +27,38 @@ def __init__(self, device=None): self.rank = rank self.world_size = world_size - def reset(self): - """Resets the accumulators. - """ + """Resets the accumulators.""" self.count = 0 if self.acc is not None: self.acc[:] = 0 - def _reduce(self, metrics): if self.world_size == 1: return - metrics_list = [v for k,v in metrics.items()] + metrics_list = [v for k, v in metrics.items()] metrics_tensor = torch.tensor(metrics_list, device=self.device) dist.reduce(metrics_tensor, 0, op=dist.ReduceOp.SUM) metrics_tensor /= self.world_size - for i,k in enumerate(metrics.keys()): + for i, k in enumerate(metrics.keys()): metrics[k] = metrics_tensor[i] - - - + def update(self, metrics, num_samples=1): """Updates the values of the metric - It uses recursive formula, it may be more numerically stable - - m^(i) = m^(i-1) + n^(i)/sum(n^(i)) (x^(i) - m^(i-1)) - - where i is the batch number, - m^(i) is the accumulated average of the metric at batch i, - x^(i) is the average of the metric at batch i, - n^(i) is the batch_size at batch i. - - Args: - metrics: dictionary with metrics for current batch - num_samples: number of samples in current batch (batch_size) + It uses recursive formula, it may be more numerically stable + + m^(i) = m^(i-1) + n^(i)/sum(n^(i)) (x^(i) - m^(i-1)) + + where i is the batch number, + m^(i) is the accumulated average of the metric at batch i, + x^(i) is the average of the metric at batch i, + n^(i) is the batch_size at batch i. + + Args: + metrics: dictionary with metrics for current batch + num_samples: number of samples in current batch (batch_size) """ self._reduce(metrics) if self.rank != 0: @@ -73,22 +69,17 @@ def update(self, metrics, num_samples=1): self.acc = np.zeros((len(self.keys),)) self.count += num_samples - r = num_samples/self.count + r = num_samples / self.count for i, k in enumerate(self.keys): self.acc[i] += r * (metrics[k] - self.acc[i]) - - @property def metrics(self): - """ Returns metrics dictionary - """ + """Returns metrics dictionary""" if self.rank != 0: return {} logs = ODict() - for i,k in enumerate(self.keys): + for i, k in enumerate(self.keys): logs[k] = self.acc[i] return logs - - diff --git a/hyperion/torch/utils/misc.py b/hyperion/torch/utils/misc.py index bdd7086f..2b4f6034 100644 --- a/hyperion/torch/utils/misc.py +++ b/hyperion/torch/utils/misc.py @@ -6,6 +6,7 @@ import torch import torch.cuda.amp as amp + def l2_norm(x, axis=-1): with amp.autocast(enabled=False): norm = torch.norm(x.float(), 2, axis, True) + 1e-10 @@ -14,8 +15,8 @@ def l2_norm(x, axis=-1): def compute_snr(x, n, axis=-1): - P_x = 10*torch.log10(torch.mean(x**2, dim=axis)) - P_n = 10*torch.log10(torch.mean(n**2, dim=axis)) + P_x = 10 * torch.log10(torch.mean(x ** 2, dim=axis)) + P_n = 10 * torch.log10(torch.mean(n ** 2, dim=axis)) return P_x - P_n @@ -26,31 +27,26 @@ def compute_stats_adv_attack(x, x_adv): x_adv = torch.flatten(x_adv, start_dim=1) noise = x_adv - x - P_x = 10 * torch.log10(torch.mean(x**2, dim=-1)) - P_n = 10 * torch.log10(torch.mean(noise**2, dim=-1)) + P_x = 10 * torch.log10(torch.mean(x ** 2, dim=-1)) + P_n = 10 * torch.log10(torch.mean(noise ** 2, dim=-1)) snr = P_x - P_n - #x_l1 = torch.sum(torch.abs(x), dim=-1) + # x_l1 = torch.sum(torch.abs(x), dim=-1) x_l2 = torch.norm(x, dim=-1) x_linf = torch.max(x, dim=-1)[0] abs_n = torch.abs(noise) - n_l0 = torch.sum(abs_n>0, dim=-1).float() - #n_l1 = torch.sum(abs_n, 
dim=-1) + n_l0 = torch.sum(abs_n > 0, dim=-1).float() + # n_l1 = torch.sum(abs_n, dim=-1) n_l2 = torch.norm(noise, dim=-1) n_linf = torch.max(noise, dim=-1)[0] return snr, P_x, P_n, x_l2, x_linf, n_l0, n_l2, n_linf - + def get_selfsim_tarnon(y, return_mask=False): y_bin = y.unsqueeze(-1) - y.unsqueeze(0) + 1 - y_bin[y_bin!=1] = 0 + y_bin[y_bin != 1] = 0 y_bin = y_bin.float() if not return_mask: return y_bin - mask = torch.triu(torch.ones_like(y_bin, dtype=torch.bool), - diagonal=1) + mask = torch.triu(torch.ones_like(y_bin, dtype=torch.bool), diagonal=1) return y_bin, mask - - - - diff --git a/hyperion/transforms/__init__.py b/hyperion/transforms/__init__.py index 1840bf5b..3f6c5f45 100644 --- a/hyperion/transforms/__init__.py +++ b/hyperion/transforms/__init__.py @@ -18,4 +18,3 @@ from .cent_whiten_up import CentWhitenUP from .lnorm_up import LNormUP - diff --git a/hyperion/transforms/cent_whiten.py b/hyperion/transforms/cent_whiten.py index fd3e2beb..00a83cca 100644 --- a/hyperion/transforms/cent_whiten.py +++ b/hyperion/transforms/cent_whiten.py @@ -11,9 +11,10 @@ from ..hyp_model import HypModel from ..pdfs import Normal + class CentWhiten(HypModel): - """Class to do centering and whitening of i-vectors. - """ + """Class to do centering and whitening of i-vectors.""" + def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): super().__init__(**kwargs) self.mu = mu @@ -21,24 +22,20 @@ def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): self.update_mu = update_mu self.update_T = update_T - - def predict(self, x): if self.mu is not None: x = x - self.mu if self.T is not None: if self.T.ndim == 1: - x = x*T + x = x * T else: x = np.dot(x, self.T) return x - - def fit(self, x=None, sample_weight=None, mu=None, S=None): - + if x is not None: - if x.shape[0]>x.shape[1]: + if x.shape[0] > x.shape[1]: gauss = Normal(x_dim=x.shape[1]) gauss.fit(x=x, sample_weight=sample_weight) mu = gauss.mu @@ -52,84 +49,71 @@ def fit(self, x=None, sample_weight=None, mu=None, S=None): if self.update_T: d, V = la.eigh(S) - V *= np.sqrt(1/d) + V *= np.sqrt(1 / d) V = np.fliplr(V) - - p = V[0,:] < 0 - V[:,p] *= -1 - + + p = V[0, :] < 0 + V[:, p] *= -1 + nonzero = d > 0 if not np.all(nonzero): V = V[:, nonzero[::-1]] - - self.T = V - + self.T = V def get_config(self): - config = {'update_mu': self.update_mu, - 'update_t': self.update_T } + config = {"update_mu": self.update_mu, "update_t": self.update_T} base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {'mu': self.mu, - 'T': self.T} + params = {"mu": self.mu, "T": self.T} self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'T'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(mu=params['mu'], T=params['T'], name=config['name']) + param_list = ["mu", "T"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(mu=params["mu"], T=params["T"], name=config["name"]) - - @classmethod def load_mat(cls, file_path): - with h5py.File(file_path, 'r') as f: - mu = np.asarray(f['mu'], dtype='float32') - T = np.asarray(f['T'], dtype='float32') + with h5py.File(file_path, "r") as f: + mu = np.asarray(f["mu"], dtype="float32") + T = np.asarray(f["T"], dtype="float32") return cls(mu, T) - - def save_mat(self, file_path): - with h5py.File(file_path, 'w') as f: - f.create_dataset('mu', data=self.mu) - f.create_dataset('T', data=self.T) - 
- + with h5py.File(file_path, "w") as f: + f.create_dataset("mu", data=self.mu) + f.create_dataset("T", data=self.T) @staticmethod def filter_args(**kwargs): - valid_args = ('update_mu', 'update_T', 'name') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - - + valid_args = ("update_mu", "update_T", "name") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'update-mu', default=True, - type=bool, - help=('updates centering parameter')) + p1 = "--" + prefix + "." - parser.add_argument(p1+'update-T', default=True, - type=bool, - help=('updates whitening parameter')) + parser.add_argument( + p1 + "update-mu", + default=True, + type=bool, + help=("updates centering parameter"), + ) - parser.add_argument(p1+'name', default='lnorm') + parser.add_argument( + p1 + "update-T", + default=True, + type=bool, + help=("updates whitening parameter"), + ) + parser.add_argument(p1 + "name", default="lnorm") add_argparse_args = add_class_args - diff --git a/hyperion/transforms/cent_whiten_up.py b/hyperion/transforms/cent_whiten_up.py index 57679b81..f22488f4 100644 --- a/hyperion/transforms/cent_whiten_up.py +++ b/hyperion/transforms/cent_whiten_up.py @@ -12,28 +12,22 @@ from ..pdfs import Normal from .cent_whiten import CentWhiten + class CentWhitenUP(CentWhiten): - """Class to do centering and whitening with uncertainty propagation. - """ - def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): - super(CentWhitenUP, self).__init__( - mu, T, update_mu, update_T, **kwargs) + """Class to do centering and whitening with uncertainty propagation.""" + def __init__(self, mu=None, T=None, update_mu=True, update_T=True, **kwargs): + super(CentWhitenUP, self).__init__(mu, T, update_mu, update_T, **kwargs) - def predict(self, x): - x_dim = int(x.shape[-1]/2) - m_x = x[:,:x_dim] - s2_x = x[:,x_dim:] + x_dim = int(x.shape[-1] / 2) + m_x = x[:, :x_dim] + s2_x = x[:, x_dim:] m_x = super(CentWhitenUP, self).predict(m_x) for i in range(x.shape[0]): s2_x[i] = np.diag(np.dot(self.T.T * s2_x[i], self.T)) return np.hstack((m_x, s2_x)) - - def fit(self, x, sample_weight=None): - x = x[:,:int(x.shape[-1]/2)] + x = x[:, : int(x.shape[-1] / 2)] super(CentWhitenUP, self).fit(x, sample_weight=sample_weight) - - diff --git a/hyperion/transforms/coral.py b/hyperion/transforms/coral.py index e2338fe6..0c9dea85 100644 --- a/hyperion/transforms/coral.py +++ b/hyperion/transforms/coral.py @@ -10,10 +10,21 @@ from ..hyp_model import HypModel + class CORAL(HypModel): - """Class to do CORAL - """ - def __init__(self, mu=None, T_col=None, T_white=None, update_mu=True, update_T=True, alpha_mu=1, alpha_T=1, **kwargs): + """Class to do CORAL""" + + def __init__( + self, + mu=None, + T_col=None, + T_white=None, + update_mu=True, + update_T=True, + alpha_mu=1, + alpha_T=1, + **kwargs + ): super(CORAL, self).__init__(**kwargs) self.mu = mu self.T_col = T_col @@ -24,21 +35,19 @@ def __init__(self, mu=None, T_col=None, T_white=None, update_mu=True, update_T=T self.alpha_mu = alpha_mu self.alpha_T = alpha_T - def get_config(self): - config = {'update_mu': self.update_mu, - 'update_t': self.update_T, - 'pca_dim': self.pca_dim} + config = { + "update_mu": self.update_mu, + "update_t": self.update_T, + "pca_dim": self.pca_dim, + } base_config = super(CORAL, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - def 
_compute_T(self): if self.T_col is not None and self.T_white is not None: self.T = np.dot(self.T_white, self.T_col) - def predict(self, x): if self.T is None: self._compute_T() @@ -46,11 +55,10 @@ def predict(self, x): x = x - self.mu if self.T is not None: - x = np.dot(x, self.T) + x = np.dot(x, self.T) return x - def fit(self, x, sample_weight=None, x_out=None, sample_weight_out=None): if x_out is None: @@ -59,37 +67,42 @@ def fit(self, x, sample_weight=None, x_out=None, sample_weight_out=None): mu_out = np.mean(x_out, axis=0) if self.update_T: delta = x_out - mu_out - S_out = np.dot(delta.T, delta)/x_out.shape[0] + S_out = np.dot(delta.T, delta) / x_out.shape[0] # zero-phase component analysis (ZCA) d, V = la.eigh(S_out) - self.T_white = np.dot(V * (1/np.sqrt(d)), V.T) - + self.T_white = np.dot(V * (1 / np.sqrt(d)), V.T) + mu_in = np.mean(x, axis=0) if self.update_T: delta = x - mu_in - S_in = np.dot(delta.T, delta)/x.shape[0] + S_in = np.dot(delta.T, delta) / x.shape[0] if self.alpha_T < 1: S_in = self.alpha_T * S_in + (1 - self.alpha_T) * S_out # zero-phase component analysis (ZCA) d, V = la.eigh(S_in) - d[d<0] = 0 + d[d < 0] = 0 self.T_col = np.dot(V * np.sqrt(d), V.T) if self.update_mu: - self.mu = self.alpha_mu*(mu_out - mu_in) + self.mu = self.alpha_mu * (mu_out - mu_in) - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'T_col', 'T_white'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(mu=params['mu'], T_col=params['T_col'], T_white=params['T_white'], name=config['name']) - + param_list = ["mu", "T_col", "T_white"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + mu=params["mu"], + T_col=params["T_col"], + T_white=params["T_white"], + name=config["name"], + ) def save_params(self, f): - params = {'mu': self.mu, - 'T_col': self.T_col, - 'T_white': self.T_white, - 'alpha_mu': self.alpha_mu, - 'alpha_T': self.alpha_T} + params = { + "mu": self.mu, + "T_col": self.T_col, + "T_white": self.T_white, + "alpha_mu": self.alpha_mu, + "alpha_T": self.alpha_T, + } self._save_params_from_dict(f, params) diff --git a/hyperion/transforms/gaussianizer.py b/hyperion/transforms/gaussianizer.py index 83f26323..ea512ade 100644 --- a/hyperion/transforms/gaussianizer.py +++ b/hyperion/transforms/gaussianizer.py @@ -15,100 +15,87 @@ class Gaussianizer(HypModel): - """Class to make i-vector distribution standard Normal. 
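The CORAL fit above composes a ZCA whitening of the out-of-domain covariance with a recoloring by the in-domain covariance, so transformed vectors end up with approximately the in-domain second-order statistics. A rough numpy sketch of that effect on synthetic data, assuming alpha_mu = alpha_T = 1 (variable names here are illustrative):

import numpy as np
import scipy.linalg as la

rng = np.random.RandomState(0)
x_out = rng.randn(5000, 4) @ rng.randn(4, 4)          # out-of-domain vectors (synthetic)
x_in = rng.randn(5000, 4) @ (2.0 * rng.randn(4, 4))   # in-domain vectors (synthetic)

S_out = np.cov(x_out, rowvar=False, bias=True)
S_in = np.cov(x_in, rowvar=False, bias=True)

d, V = la.eigh(S_out)
T_white = (V * (1.0 / np.sqrt(d))) @ V.T              # ZCA whitening w.r.t. S_out, as in fit()
d, V = la.eigh(S_in)
T_col = (V * np.sqrt(d)) @ V.T                        # recoloring with S_in, as in fit()

y = (x_out - x_out.mean(axis=0)) @ T_white @ T_col
print(np.allclose(np.cov(y, rowvar=False, bias=True), S_in))  # expected: True, covariances now match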
- """ + """Class to make i-vector distribution standard Normal.""" + def __init__(self, max_vectors=None, r=None, **kwargs): super(Gaussianizer, self).__init__(**kwargs) self.max_vectors = max_vectors self.r = r - - def predict(self, x): - px_cum = np.linspace(0, 1, self.r.shape[0]+2)[1:-1] - y_map = erfinv(2*px_cum-1)*np.sqrt(2) + px_cum = np.linspace(0, 1, self.r.shape[0] + 2)[1:-1] + y_map = erfinv(2 * px_cum - 1) * np.sqrt(2) - r=self.r[1:] + r = self.r[1:] y = np.zeros_like(x) for i in range(x.shape[1]): - y_index = np.searchsorted(r[:,i], x[:,i]) + y_index = np.searchsorted(r[:, i], x[:, i]) logging.debug(y_index) - y[:,i] = y_map[y_index] - - return y + y[:, i] = y_map[y_index] + return y - def fit(self, x): - r = np.sort(x, axis=0, kind='heapsort') + r = np.sort(x, axis=0, kind="heapsort") x = np.zeros((1, x.shape[-1]), dtype=float_cpu()) if r.shape[0] > self.max_vectors: - index = np.round(np.linspace(0, r.shape[0]-1, self.max_vectors, dtype=float)).astype(int) + index = np.round( + np.linspace(0, r.shape[0] - 1, self.max_vectors, dtype=float) + ).astype(int) r = r[index, :] self.r = np.vstack((x, r)) - - def get_config(self): - config = {'max_vectors': self.max_vectors} + config = {"max_vectors": self.max_vectors} base_config = super(Gaussianizer, self).get_config() return dict(list(base_config.items()) + list(config.items())) - - def save_params(self, f): - params = {'r': self.r } + params = {"r": self.r} self._save_params_from_dict(f, params) - - @classmethod def load_params(cls, f, config): - param_list = ['r'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(r=params['r'], max_vectors=config['max_vectors'], name=config['name']) + param_list = ["r"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls( + r=params["r"], max_vectors=config["max_vectors"], name=config["name"] + ) - - @classmethod def load_mat(cls, file_path): - with h5py.File(file_path, 'r') as f: - r = np.asarray(f['r'], dtype='float32') + with h5py.File(file_path, "r") as f: + r = np.asarray(f["r"], dtype="float32") return cls(r=r) - - def save_mat(self, file_path): - with h5py.File(file_path, 'w') as f: - f.create_dataset('r', data=self.r) - - + with h5py.File(file_path, "w") as f: + f.create_dataset("r", data=self.r) @staticmethod def filter_args(**kwargs): - valid_args = ('max_vectors', 'name') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - - + valid_args = ("max_vectors", "name") + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) @staticmethod def add_class_args(parser, prefix=None): if prefix is None: - p1 = '--' + p1 = "--" else: - p1 = '--' + prefix + '.' - - parser.add_argument(p1+'max-vectors', default=None, - type=int, - help=('maximum number of background vectors')) + p1 = "--" + prefix + "." - parser.add_argument(p1+'name', default='gauss') + parser.add_argument( + p1 + "max-vectors", + default=None, + type=int, + help=("maximum number of background vectors"), + ) + parser.add_argument(p1 + "name", default="gauss") add_arparse_args = add_class_args diff --git a/hyperion/transforms/lda.py b/hyperion/transforms/lda.py index 169521f4..142ed2bd 100644 --- a/hyperion/transforms/lda.py +++ b/hyperion/transforms/lda.py @@ -11,10 +11,13 @@ from ..hyp_model import HypModel from .sb_sw import SbSw + class LDA(HypModel): - """Class to do linear discriminant analysis. 
- """ - def __init__(self, mu=None, T=None, lda_dim=None, update_mu=True, update_T=True, **kwargs): + """Class to do linear discriminant analysis.""" + + def __init__( + self, mu=None, T=None, lda_dim=None, update_mu=True, update_T=True, **kwargs + ): super(LDA, self).__init__(**kwargs) self.mu = mu self.T = T @@ -25,19 +28,16 @@ def __init__(self, mu=None, T=None, lda_dim=None, update_mu=True, update_T=True, self.update_mu = update_mu self.update_T = update_T - - def predict(self, x): if self.mu is not None: x = x - self.mu return np.dot(x, self.T) - def fit(self, x, y, mu=None, Sb=None, Sw=None): if mu is None or Sb is None or Sw is None: sbsw = SbSw() - sbsw.fit(x,y) + sbsw.fit(x, y) mu = sbsw.mu Sb = sbsw.Sb Sw = sbsw.Sw @@ -47,48 +47,44 @@ def fit(self, x, y, mu=None, Sb=None, Sw=None): if not self.update_T: return - - assert(Sb.shape == Sw.shape) + + assert Sb.shape == Sw.shape try: d, V = la.eigh(Sb, Sw) except: alpha = 1e-2 * np.max(np.diag(Sw)) - d, V = la.eigh(Sb, alpha*np.eye(Sw.shape[0]) + Sw) + d, V = la.eigh(Sb, alpha * np.eye(Sw.shape[0]) + Sw) V = np.fliplr(V) - p = V[0,:] < 0 - V[:,p] *= -1 + p = V[0, :] < 0 + V[:, p] *= -1 - if self.lda_dim is not None: assert self.lda_dim <= V.shape[1] - V = V[:,:self.lda_dim] + V = V[:, : self.lda_dim] self.T = V - def get_config(self): - config = { 'lda_dim': self.lda_dim, - 'update_mu': self.update_mu, - 'update_t': self.update_T } + config = { + "lda_dim": self.lda_dim, + "update_mu": self.update_mu, + "update_t": self.update_T, + } base_config = super(LDA, self).get_config() return dict(list(base_config.items()) + list(config.items())) - def save_params(self, f): - params = {'mu': self.mu, - 'T': self.T} + params = {"mu": self.mu, "T": self.T} self._save_params_from_dict(f, params) - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'T'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(mu=params['mu'], T=params['T'], name=config['name']) + param_list = ["mu", "T"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(mu=params["mu"], T=params["T"], name=config["name"]) - # @classmethod # def load(cls, file_path): # with h5py.File(file_path, 'r') as f: @@ -96,19 +92,15 @@ def load_params(cls, f, config): # param_list = ['mu', 'T'] # params = self._load_params_to_dict(f, config['name'], param_list) # return cls(mu=params['mu'], T=params['T'], name=config['name']) - - + @classmethod def load_mat(cls, file_path): - with h5py.File(file_path, 'r') as f: - mu = np.asarray(f['mu'], dtype='float32') - T = np.asarray(f['T'], dtype='float32') + with h5py.File(file_path, "r") as f: + mu = np.asarray(f["mu"], dtype="float32") + T = np.asarray(f["T"], dtype="float32") return cls(mu, T) def save_mat(self, file_path): - with h5py.File(file_path, 'w') as f: - f.create_dataset('mu', data=self.mu) - f.create_dataset('T', data=self.T) - - - + with h5py.File(file_path, "w") as f: + f.create_dataset("mu", data=self.mu) + f.create_dataset("T", data=self.T) diff --git a/hyperion/transforms/lnorm.py b/hyperion/transforms/lnorm.py index 4ffd5f5e..088748b2 100644 --- a/hyperion/transforms/lnorm.py +++ b/hyperion/transforms/lnorm.py @@ -7,11 +7,11 @@ from .cent_whiten import CentWhiten + class LNorm(CentWhiten): - """Class to do length normalization. 
- """ + """Class to do length normalization.""" + def predict(self, x): x = super().predict(x) - mx = np.sqrt(np.sum(x**2, axis=1, keepdims=True)) + 1e-10 - return np.sqrt(x.shape[1])*x/mx - + mx = np.sqrt(np.sum(x ** 2, axis=1, keepdims=True)) + 1e-10 + return np.sqrt(x.shape[1]) * x / mx diff --git a/hyperion/transforms/lnorm_up.py b/hyperion/transforms/lnorm_up.py index 9d3fd49d..ab7b1ec9 100644 --- a/hyperion/transforms/lnorm_up.py +++ b/hyperion/transforms/lnorm_up.py @@ -9,19 +9,18 @@ from .cent_whiten_up import CentWhitenUP + class LNormUP(CentWhitenUP): - """Class to do Lenght Normalization with uncertainty propagation - """ + """Class to do Lenght Normalization with uncertainty propagation""" def predict(self, x): x = super(LNormUP, self).predict(x) - x_dim = int(x.shape[-1]/2) - m_x = x[:,:x_dim] - s2_x = x[:,x_dim:] - - mx2 = np.sum(m_x**2, axis=1, keepdims=True) + 1e-10 + x_dim = int(x.shape[-1] / 2) + m_x = x[:, :x_dim] + s2_x = x[:, x_dim:] + + mx2 = np.sum(m_x ** 2, axis=1, keepdims=True) + 1e-10 m_x /= np.sqrt(mx2) s2_x /= mx2 - - return np.hstack((m_x, s2_x)) + return np.hstack((m_x, s2_x)) diff --git a/hyperion/transforms/mvn.py b/hyperion/transforms/mvn.py index 58d6562b..a3b77582 100644 --- a/hyperion/transforms/mvn.py +++ b/hyperion/transforms/mvn.py @@ -10,35 +10,32 @@ from ..hyp_model import HypModel + class MVN(HypModel): - """Class to do global mean and variance normalization. - """ + """Class to do global mean and variance normalization.""" + def __init__(self, mu=None, s=None, **kwargs): super(MVN, self).__init__(**kwargs) self.mu = mu self.s = s - def predict(self, x): if self.mu is not None: x = x - self.mu if self.s is not None: - x = x/self.s + x = x / self.s return x - + def fit(self, x): self.mu = np.mean(x, axis=0) self.s = np.std(x, axis=0) - def save_params(self, f): - params = {'mu': self.mu, - 's': self.s} + params = {"mu": self.mu, "s": self.s} self._save_params_from_dict(f, params) - @classmethod def load_params(cls, f, config): - param_list = ['mu', 's'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(mu=params['mu'], s=params['s'], name=config['name']) + param_list = ["mu", "s"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(mu=params["mu"], s=params["s"], name=config["name"]) diff --git a/hyperion/transforms/nap.py b/hyperion/transforms/nap.py index 538f68be..6917c6b4 100644 --- a/hyperion/transforms/nap.py +++ b/hyperion/transforms/nap.py @@ -10,45 +10,41 @@ from ..hyp_model import HypModel + class NAP(HypModel): - """Class to do nussance attribute projection. 
- """ + """Class to do nussance attribute projection.""" + def __init__(self, U=None, **kwargs): super(NAP, self).__init__(**kwargs) self.U = U - def predict(self, x): return x - np.dot(np.dot(x, self.U.T), self.U) - def fit(self, x, U_dim, class_ids): x_hat = np.zeros_like(x) u_ids = np.uniqe(class_ids) M = np.sqrt(len(u_ids)) for i in u_ids: - idx = np.nonzero(i==class_ids) + idx = np.nonzero(i == class_ids) N = np.sqrt(len(idx)) - mu_i = np.mean(x[idx,:], axis=0) - xx[idx, :] = (x[idx, :] - mu_i)/N + mu_i = np.mean(x[idx, :], axis=0) + xx[idx, :] = (x[idx, :] - mu_i) / N xx /= M _, s, Vt = np.svd(xx, full_matrices=False, overwrite_a=True) idx = (np.argsort(s)[::-1])[:U_dim] self.U = Vt[idx, :] - def save_params(self, f): - params = {'U': self.U} + params = {"U": self.U} self._save_params_from_dict(f, params) - @classmethod def load_params(cls, f, config): - param_list = ['U'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(U=params['U'], name=config['name']) + param_list = ["U"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(U=params["U"], name=config["name"]) - # @classmethod # def load(cls, file_path): # with h5py.File(file_path, 'r') as f: @@ -56,19 +52,13 @@ def load_params(cls, f, config): # param_list = ['U'] # params = self._load_params_to_dict(f, config['name'], param_list) # return cls(U=params['U'], name=config['name']) - - + @classmethod def load_mat(cls, file_path): - with h5py.File(file_path, 'r') as f: - U = np.asarray(f['U'], dtype='float32') + with h5py.File(file_path, "r") as f: + U = np.asarray(f["U"], dtype="float32") return cls(U) - def save_mat(self, file_path): - with h5py.File(file_path, 'w') as f: - f.create_dataset('U', data=self.U) - - - - + with h5py.File(file_path, "w") as f: + f.create_dataset("U", data=self.U) diff --git a/hyperion/transforms/nda.py b/hyperion/transforms/nda.py index a69dee84..4f9772fc 100644 --- a/hyperion/transforms/nda.py +++ b/hyperion/transforms/nda.py @@ -11,22 +11,20 @@ from ..hyp_model import HypModel from ..hyp_defs import float_cpu + class NDA(HypModel): - """Class to do nearest-neighbors discriminant analysis - """ - + """Class to do nearest-neighbors discriminant analysis""" + def __init__(self, mu=None, T=None, **kwargs): super().__init__(**kwargs) self.mu = mu self.T = T - def predict(self, x): if self.mu is not None: x = x - self.mu return np.dot(x, self.T) - def fit(self, mu, Sb, Sw, nda_dim=None): self.mu = mu @@ -35,41 +33,33 @@ def fit(self, mu, Sb, Sw, nda_dim=None): d, V = la.eigh(Sb, Sw) V = np.fliplr(V) - p = V[0,:] < 0 - V[:,p] *= -1 + p = V[0, :] < 0 + V[:, p] *= -1 - if nda_dim is not None: assert nda_dim <= V.shape[1] - V = V[:,:nda_dim] + V = V[:, :nda_dim] self.T = V - - + def save_params(self, f): - params = {'mu': self.mu, - 'T': self.T} + params = {"mu": self.mu, "T": self.T} self._save_params_from_dict(f, params) - @classmethod def load_params(cls, f, config): - param_list = ['mu', 'T'] - params = cls._load_params_to_dict(f, config['name'], param_list) - return cls(mu=params['mu'], T=params['T'], name=config['name']) + param_list = ["mu", "T"] + params = cls._load_params_to_dict(f, config["name"], param_list) + return cls(mu=params["mu"], T=params["T"], name=config["name"]) - @classmethod def load_mat(cls, file_path): - with h5py.File(file_path, 'r') as f: - mu = np.asarray(f['mu'], dtype='float32') - T = np.asarray(f['T'], dtype='float32') + with h5py.File(file_path, "r") as f: + mu = np.asarray(f["mu"], dtype="float32") + T = 
np.asarray(f["T"], dtype="float32") return cls(mu, T) def save_mat(self, file_path): - with h5py.File(file_path, 'w') as f: - f.create_dataset('mu', data=self.mu) - f.create_dataset('T', data=self.T) - - - + with h5py.File(file_path, "w") as f: + f.create_dataset("mu", data=self.mu) + f.create_dataset("T", data=self.T) diff --git a/hyperion/transforms/pca.py b/hyperion/transforms/pca.py index cf3e7997..cd8d6973 100644 --- a/hyperion/transforms/pca.py +++ b/hyperion/transforms/pca.py @@ -12,8 +12,7 @@ class PCA(HypModel): - """Class to do principal component analysis - """ + """Class to do principal component analysis""" def __init__( self, diff --git a/hyperion/transforms/sb_sw.py b/hyperion/transforms/sb_sw.py index 30e71163..83c8d185 100644 --- a/hyperion/transforms/sb_sw.py +++ b/hyperion/transforms/sb_sw.py @@ -11,9 +11,10 @@ from ..hyp_model import HypModel from ..hyp_defs import float_cpu + class SbSw(HypModel): - """Class to compute between and within class matrices - """ + """Class to compute between and within class matrices""" + def __init__(self, Sb=None, Sw=None, mu=None, num_classes=0, **kwargs): super(SbSw, self).__init__(**kwargs) self.Sb = None @@ -21,7 +22,6 @@ def __init__(self, Sb=None, Sw=None, mu=None, num_classes=0, **kwargs): self.mu = None self.num_classes = num_classes - def fit(self, x, class_ids, sample_weight=None, class_weights=None, normalize=True): dim = x.shape[1] if self.Sb is None: @@ -34,26 +34,22 @@ def fit(self, x, class_ids, sample_weight=None, class_weights=None, normalize=Tr self.num_classes += len(u_ids) for i in u_ids: - idx = (class_ids==i) + idx = class_ids == i N_i = np.sum(idx) - mu_i = np.mean(x[idx,:], axis=0) + mu_i = np.mean(x[idx, :], axis=0) self.mu += mu_i x_i = x[idx, :] - mu_i self.Sb += np.outer(mu_i, mu_i) - self.Sw += np.dot(x_i.T, x_i)/N_i + self.Sw += np.dot(x_i.T, x_i) / N_i if normalize: self.normalize() - - def normalize(self): self.mu /= self.num_classes - self.Sb = self.Sb/self.num_classes - np.outer(self.mu, self.mu) + self.Sb = self.Sb / self.num_classes - np.outer(self.mu, self.mu) self.Sw /= self.num_classes - - @classmethod def accum_stats(cls, stats): mu = np.zeros_like(stats[0].mu) @@ -66,39 +62,32 @@ def accum_stats(cls, stats): Sw += s.Sw num_classes += s.num_classes return cls(mu=mu, Sb=Sb, Sw=Sw, num_classes=num_classes) - - def save_params(self, f): - params = {'mu': self.mu, - 'Sb': self.Sb, - 'Sw': self.Sw, - 'num_classes': self.num_classes} + params = { + "mu": self.mu, + "Sb": self.Sb, + "Sw": self.Sw, + "num_classes": self.num_classes, + } self._save_params_from_dict(f, params) - - @classmethod def load(cls, file_path): - with h5py.File(file_path,'r') as f: - config = self.load_config_from_json(f['config']) - param_list = ['mu', 'Sb', 'Sw', 'num_classes'] - params = cls._load_params_to_dict(f, config['name'], param_list) + with h5py.File(file_path, "r") as f: + config = self.load_config_from_json(f["config"]) + param_list = ["mu", "Sb", "Sw", "num_classes"] + params = cls._load_params_to_dict(f, config["name"], param_list) kwargs = dict(list(config.items()) + list(params.items())) return cls(**kwargs) - - - -class NSbSw(SbSw): +class NSbSw(SbSw): def __init__(self, K=10, alpha=1, **kwargs): super(NSbSw, self).__init__(**kwargs) self.K = K self.alpha = alpha - - def fit(self, x, class_ids, sample_weight=None, class_weights=None, normalize=True): dim = x.shape[1] self.Sb = np.zeros((dim, dim), dtype=float_cpu()) @@ -106,54 +95,50 @@ def fit(self, x, class_ids, sample_weight=None, class_weights=None, 
normalize=Tr self.mu = np.zeros((dim,), dtype=float_cpu()) u_ids = np.unique(class_ids) - self.num_classes = np.max(u_ids)+1 + self.num_classes = np.max(u_ids) + 1 d = np.zeros((self.num_classes, x.shape[0]), dtype=float_cpu()) delta = np.zeros((self.num_classes,) + x.shape, dtype=float_cpu()) for i in u_ids: - idx_i = (class_ids==i) + idx_i = class_ids == i - mu_i = np.mean(x[idx_i,:], axis=0) + mu_i = np.mean(x[idx_i, :], axis=0) self.mu += mu_i x_i = x[idx_i] tree = BallTree(x_i) d_i, NN_i = tree.query(x, k=self.K, dualtree=True, sort_results=True) - d[i] = d_i[:,-1] + d[i] = d_i[:, -1] for l in range(x.shape[0]): - delta[i,l] = x[l] - np.mean(x_i[NN_i[l]], axis=0) + delta[i, l] = x[l] - np.mean(x_i[NN_i[l]], axis=0) - d = d**self.alpha + d = d ** self.alpha for i in u_ids: - idx_i = (class_ids==i).nonzero()[0] + idx_i = (class_ids == i).nonzero()[0] N_i = len(idx_i) w_i = 0 Sb_i = np.zeros(self.Sb.shape, dtype=float_cpu()) - + for j in range(self.num_classes): - w_ij = np.minimum(d[i], d[j])/(d[i]+d[j]) + w_ij = np.minimum(d[i], d[j]) / (d[i] + d[j]) for l in idx_i: - S = np.outer(delta[j,l], delta[j,l]) - if i==j: - self.Sw += S/N_i + S = np.outer(delta[j, l], delta[j, l]) + if i == j: + self.Sw += S / N_i else: - Sb_i += w_ij[l]*S + Sb_i += w_ij[l] * S w_i += w_ij[l] - self.Sb += Sb_i/w_i + self.Sb += Sb_i / w_i if normalize: self.normalize() - def normalize(self): self.mu /= self.num_classes self.Sb /= self.num_classes self.Sw /= self.num_classes - - def get_config(self): - config = { 'K': self.K, - 'alpha': self.alpha } + config = {"K": self.K, "alpha": self.alpha} base_config = super(NSbSw, self).get_config() return dict(list(base_config.items()) + list(config.items())) diff --git a/hyperion/transforms/skl_tsne.py b/hyperion/transforms/skl_tsne.py index 2f43400b..048be0c7 100644 --- a/hyperion/transforms/skl_tsne.py +++ b/hyperion/transforms/skl_tsne.py @@ -9,14 +9,15 @@ from ..hyp_model import HypModel + class SklTSNE(HypModel): """Wrapper class for sklearn TSNE manifold learner Attributes: tsne_dim: dimension of the embedded space. - perplexity: the perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. - early_exaggeration: controls how tight natural clusters in the original space are in the embedded space and how much space will be between them. - lr: the learning rate for t-SNE is usually in the range [10.0, 1000.0]. + perplexity: the perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. + early_exaggeration: controls how tight natural clusters in the original space are in the embedded space and how much space will be between them. + lr: the learning rate for t-SNE is usually in the range [10.0, 1000.0]. num_iter: maximum number of iterations for the optimization. num_iter_without_progress: maximum number of iterations without progress before we abort the optimization min_grad_norm: if the gradient norm is below this threshold, the optimization will be stopped. @@ -29,11 +30,26 @@ class SklTSNE(HypModel): angle: angle thetha in Barnes-Hut TSNE num_jobs: number of parallel jobs to run for neighbors search. 
""" - def __init__(self, tsne_dim=2, perplexity=30.0, early_exaggeration=12.0, - lr=200.0, num_iter=1000, num_iter_without_progress=300, - min_grad_norm=1e-07, metric='euclidean', init='random', - verbose=0, rng=None, rng_seed=1234, method='barnes_hut', - angle=0.5, num_jobs=None, **kwargs): + + def __init__( + self, + tsne_dim=2, + perplexity=30.0, + early_exaggeration=12.0, + lr=200.0, + num_iter=1000, + num_iter_without_progress=300, + min_grad_norm=1e-07, + metric="euclidean", + init="random", + verbose=0, + rng=None, + rng_seed=1234, + method="barnes_hut", + angle=0.5, + num_jobs=None, + **kwargs + ): super().__init__(**kwargs) self.rng_seed = rng_seed @@ -41,13 +57,21 @@ def __init__(self, tsne_dim=2, perplexity=30.0, early_exaggeration=12.0, rng = np.random.RandomState(seed=rng_seed) self._tsne = TSNE( - n_components=tsne_dim, perplexity=perplexity, - early_exaggeration=early_exaggeration, - learning_rate=lr, n_iter=num_iter, - n_iter_without_progress=num_iter_without_progress, - min_grad_norm=min_grad_norm, metric=metric, init=init, - verbose=verbose, random_state=rng, method=method, - angle=angle, n_jobs=num_jobs) + n_components=tsne_dim, + perplexity=perplexity, + early_exaggeration=early_exaggeration, + learning_rate=lr, + n_iter=num_iter, + n_iter_without_progress=num_iter_without_progress, + min_grad_norm=min_grad_norm, + metric=metric, + init=init, + verbose=verbose, + random_state=rng, + method=method, + angle=angle, + n_jobs=num_jobs, + ) @property def tsne_dim(self): @@ -74,7 +98,7 @@ def num_iter_without_progress(self): return self._tsne.n_iter_without_progress @property - def min_grad_norm(self): + def min_grad_norm(self): return self._tsne.min_grad_norm @property @@ -97,92 +121,123 @@ def angle(self): def num_jobs(self): return self._tsne.n_jobs - def predict(self, x): return self._tsne.fit_transform(x) def fit(self, x): return self._tsne.fit_transform(x) - def save_params(self, f): pass - + @classmethod def load_params(cls, f, config): return cls(**config) def get_config(self): - config = {'tsne_dim': self.tsne_dim, - 'perplexity': self.perplexity, - 'early_exaggeration': self.early_exaggeration, - 'lr': self.lr, - 'num_iter': self.num_iter, - 'num_iter_without_progress': self.num_iter_without_progress, - 'min_grad_norm': self.min_grad_norm, - 'metric': self.metric, - 'init': self.init, - 'rng_seed': self.rng_seed, - 'method': self.method, - 'angle': self.angle, - 'num_jobs': self.num_jobs} + config = { + "tsne_dim": self.tsne_dim, + "perplexity": self.perplexity, + "early_exaggeration": self.early_exaggeration, + "lr": self.lr, + "num_iter": self.num_iter, + "num_iter_without_progress": self.num_iter_without_progress, + "min_grad_norm": self.min_grad_norm, + "metric": self.metric, + "init": self.init, + "rng_seed": self.rng_seed, + "method": self.method, + "angle": self.angle, + "num_jobs": self.num_jobs, + } base_config = super().get_config() return dict(list(base_config.items()) + list(config.items())) - @staticmethod def filter_args(**kwargs): - valid_args = ('tsne_dim', 'perplexity', 'early_exaggeration', 'lr', - 'num_iter', 'num_iter_without_progress', - 'min_grad_norm', 'metric', - 'init', 'rng_seed', 'method', 'angle', 'num_jobs') - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - - + valid_args = ( + "tsne_dim", + "perplexity", + "early_exaggeration", + "lr", + "num_iter", + "num_iter_without_progress", + "min_grad_norm", + "metric", + "init", + "rng_seed", + "method", + "angle", + "num_jobs", + ) + return dict((k, kwargs[k]) for k in 
valid_args if k in kwargs) + @staticmethod def add_class_args(parser, prefix=None): if prefix is not None: outer_parser = parser - parser = ArgumentParser(prog='') - - parser.add_argument('--tsne-dim', default=2, type=int, - help=('tsne dimension')) + parser = ArgumentParser(prog="") + + parser.add_argument("--tsne-dim", default=2, type=int, help=("tsne dimension")) - parser.add_argument('--perplexity', default=30., type=float, - help=('tsne perplexity')) parser.add_argument( - '--early-exaggeration', default=12., type=float, - help=('controls how tight natural clusters in the original space' - 'are in the embedded space and how much space will be ' - 'between them.')) - parser.add_argument('--lr', default=200., type=float, - help=('learning rate for t-sne')) - parser.add_argument('--num-iter', default=1000, type=int, - help=('max. number of iterations')) - parser.add_argument('--num-iter-without-progress', default=300, type=int, - help=('max. number of iterations without improvement')) - parser.add_argument('--min-grad-norm', default=1e-07, type=float, - help=('minimum gradient norm to stop optim.')) - parser.add_argument('--metric', default='euclidean', - choices=['cosine', 'euclidean', 'l1', 'l2', 'precomputed'], - help=('distance metric')) - parser.add_argument('--init', default='random', - choices=['random', 'pca'], - help=('initialization method')) - parser.add_argument('--method', default='barnes_hut', - choices=['barnes_hut', 'exact'], - help=('gradient calculation method')) - parser.add_argument('--angle', default=0.5, type=float, - help=('angle thetha in Barnes-Hut TSNE')) - parser.add_argument('--num-jobs', default=1, type=int, - help=('num parallel jobs for NN search')) - parser.add_argument('--rnd-seed', default=1234, type=int, - help=('random seed')) + "--perplexity", default=30.0, type=float, help=("tsne perplexity") + ) + parser.add_argument( + "--early-exaggeration", + default=12.0, + type=float, + help=( + "controls how tight natural clusters in the original space" + "are in the embedded space and how much space will be " + "between them." + ), + ) + parser.add_argument( + "--lr", default=200.0, type=float, help=("learning rate for t-sne") + ) + parser.add_argument( + "--num-iter", default=1000, type=int, help=("max. number of iterations") + ) + parser.add_argument( + "--num-iter-without-progress", + default=300, + type=int, + help=("max. 
number of iterations without improvement"), + ) + parser.add_argument( + "--min-grad-norm", + default=1e-07, + type=float, + help=("minimum gradient norm to stop optim."), + ) + parser.add_argument( + "--metric", + default="euclidean", + choices=["cosine", "euclidean", "l1", "l2", "precomputed"], + help=("distance metric"), + ) + parser.add_argument( + "--init", + default="random", + choices=["random", "pca"], + help=("initialization method"), + ) + parser.add_argument( + "--method", + default="barnes_hut", + choices=["barnes_hut", "exact"], + help=("gradient calculation method"), + ) + parser.add_argument( + "--angle", default=0.5, type=float, help=("angle thetha in Barnes-Hut TSNE") + ) + parser.add_argument( + "--num-jobs", default=1, type=int, help=("num parallel jobs for NN search") + ) + parser.add_argument("--rnd-seed", default=1234, type=int, help=("random seed")) if prefix is not None: - outer_parser.add_argument( - '--' + prefix, - action=ActionParser(parser=parser)) - + outer_parser.add_argument("--" + prefix, action=ActionParser(parser=parser)) + add_argparse_args = add_class_args diff --git a/hyperion/transforms/transform_list.py b/hyperion/transforms/transform_list.py index 26da7642..3e89966a 100644 --- a/hyperion/transforms/transform_list.py +++ b/hyperion/transforms/transform_list.py @@ -22,11 +22,9 @@ from .gaussianizer import Gaussianizer - class TransformList(HypModel): - """Class to perform a list of transformations - """ - + """Class to perform a list of transformations""" + def __init__(self, transforms, **kwargs): super(TransformList, self).__init__(**kwargs) if not isinstance(transforms, list): @@ -35,48 +33,41 @@ def __init__(self, transforms, **kwargs): if transforms is not None: self.update_names() - def append(self, t): self.transforms.append(t) if self.name is not None: - t.name = self.name + '/' + t.name + t.name = self.name + "/" + t.name - def predict(self, x): for t in self.transforms: x = t.predict(x) return x - def update_names(self): if self.name is not None: for t in self.transforms: - t.name = self.name + '/' + t.name + t.name = self.name + "/" + t.name - def get_config(self): config = super(TransformList, self).get_config() config_t = {} for i in range(len(self.transforms)): config_t[i] = self.transforms[i].get_config() - config['transforms'] = config_t + config["transforms"] = config_t return config - def save_params(self, f): for t in self.transforms: t.save_params(f) - @classmethod def load_params(cls, f, config): - config_ts = config['transforms'] + config_ts = config["transforms"] transforms = [] for i in range(len(config_ts)): config_t = config_ts[str(i)] logging.debug(config_t) - class_t = globals()[config_t['class_name']] + class_t = globals()[config_t["class_name"]] t = class_t.load_params(f, config_t) transforms.append(t) - return cls(transforms, name=config['name']) - + return cls(transforms, name=config["name"]) diff --git a/hyperion/utils/__init__.py b/hyperion/utils/__init__.py index a9a407c1..bfd81028 100644 --- a/hyperion/utils/__init__.py +++ b/hyperion/utils/__init__.py @@ -14,5 +14,3 @@ from .segment_list import SegmentList from .kaldi_matrix import KaldiMatrix, KaldiCompressedMatrix from .rttm import RTTM - - diff --git a/hyperion/utils/ext_segment_list.py b/hyperion/utils/ext_segment_list.py index 4ca6ce01..38a4a1b4 100644 --- a/hyperion/utils/ext_segment_list.py +++ b/hyperion/utils/ext_segment_list.py @@ -27,20 +27,21 @@ class ExtSegmentList(object): _uniq_series_id: unique series id. 
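A small hypothetical example of how the three tables relate (all ids are invented): when ext_segments and files are omitted, the constructor below makes each segment its own extended segment and each file its own series.

import pandas as pd
from hyperion.utils.ext_segment_list import ExtSegmentList

# Illustrative only: two segments of one recording.
segments = pd.DataFrame(
    {
        "segment_id": ["f1-0000-0150", "f1-0150-0300"],
        "file_id": ["f1", "f1"],
        "tbeg": [0.0, 1.5],
        "tend": [1.5, 3.0],
    }
)
esl = ExtSegmentList(segments)
print(esl.ext_segment_id)   # defaults to segment_id
print(esl.series_id)        # defaults to file_id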
""" - def __init__(self, segments, ext_segments=None, files=None, index_column='file_id'): + def __init__(self, segments, ext_segments=None, files=None, index_column="file_id"): self.segments = segments if files is None: - file_id = self.segments['file_id'].unique() - files = pd.DataFrame({'file_id': file_id, 'series_id': file_id}) - + file_id = self.segments["file_id"].unique() + files = pd.DataFrame({"file_id": file_id, "series_id": file_id}) + if ext_segments is None: - if not 'ext_segment_id' in self.segments: + if not "ext_segment_id" in self.segments: self.segments = self.segments.assign( - ext_segment_id = self.segments['segment_id'].values) + ext_segment_id=self.segments["segment_id"].values + ) ext_segment_id = self.segments.ext_segment_id.unique() ext_segments = pd.DataFrame( - {'ext_segment_id': ext_segment_id, - 'name': np.nan, 'score': np.nan}) + {"ext_segment_id": ext_segment_id, "name": np.nan, "score": np.nan} + ) self.files = files self.ext_segments = ext_segments @@ -49,269 +50,251 @@ def __init__(self, segments, ext_segments=None, files=None, index_column='file_i self._uniq_series_id = None self.iter_idx = 0 - - @classmethod - def create(cls, segment_id, file_id, tbeg, tend, - ext_segment_id=None, series_id=None, name=np.nan, score=np.nan, - index_column='file_id'): + def create( + cls, + segment_id, + file_id, + tbeg, + tend, + ext_segment_id=None, + series_id=None, + name=np.nan, + score=np.nan, + index_column="file_id", + ): if ext_segment_id is None: ext_segment_id = segment_id - - segments = pd.DataFrame({'segment_id': segment_id, - 'file_id': file_id, - 'tbeg': tbeg, - 'tend': tend, - 'ext_segment_id': ext_segment_id}) - + + segments = pd.DataFrame( + { + "segment_id": segment_id, + "file_id": file_id, + "tbeg": tbeg, + "tend": tend, + "ext_segment_id": ext_segment_id, + } + ) + if series_id is None: - u_file_id = self.segments['file_id'].unique() - files = pd.DataFrame({'file_id': u_file_id, 'series_id': u_file_id}) + u_file_id = self.segments["file_id"].unique() + files = pd.DataFrame({"file_id": u_file_id, "series_id": u_file_id}) else: - file_id = [f for f in v for k,v in series_id.items()] - series_id = [k for f in v for k,v in series_id.items()] - files = pd.DataFrame({'file_id': file_id, 'series_id': series_id}) + file_id = [f for f in v for k, v in series_id.items()] + series_id = [k for f in v for k, v in series_id.items()] + files = pd.DataFrame({"file_id": file_id, "series_id": series_id}) if isinstance(name, str): - ext_segment_id = segments['ext_segment_id'].unique() + ext_segment_id = segments["ext_segment_id"].unique() elif isinstance(name, dict): - ext_segment_id = [k for k,v in name.items()] - name = [v for k,v in name.items()] + ext_segment_id = [k for k, v in name.items()] + name = [v for k, v in name.items()] if isinstance(score, dict): score = [score[k] for k in ext_segment_id] - ext_segments = pd.DataFrame({'ext_segment_id': ext_segment_id, - 'name': name, 'score': score}) - - return cls(segments, ext_segments, files, index_column) + ext_segments = pd.DataFrame( + {"ext_segment_id": ext_segment_id, "name": name, "score": score} + ) + return cls(segments, ext_segments, files, index_column) - @classmethod - def create_from_segment_list(cls, segment_list, series_id=None, name=np.nan, score=np.nan, - index_column='file_id'): + def create_from_segment_list( + cls, + segment_list, + series_id=None, + name=np.nan, + score=np.nan, + index_column="file_id", + ): segments = deepcopy(segment_list.segments) - segments = segments.assign( - ext_segment_id = 
segments['segment_id']) + segments = segments.assign(ext_segment_id=segments["segment_id"]) ext_segment_id = segments.ext_segment_id.unique() if not np.isnan(name): - name = [name[k] for k in segments['ext_segment_id'].values] + name = [name[k] for k in segments["ext_segment_id"].values] ext_segments = pd.DataFrame( - {'ext_segment_id': segments['ext_segment_id'].values, - 'name': name, 'score': score}) + { + "ext_segment_id": segments["ext_segment_id"].values, + "name": name, + "score": score, + } + ) if series_id is None: - u_file_id = segments['file_id'].unique() - files = pd.DataFrame({'file_id': u_file_id, 'series_id': u_file_id}) + u_file_id = segments["file_id"].unique() + files = pd.DataFrame({"file_id": u_file_id, "series_id": u_file_id}) else: - file_id = [f for f in v for k,v in series_id.items()] - series_id = [k for f in v for k,v in series_id.items()] - files = pd.DataFrame({'file_id': file_id, 'series_id': series_id}) + file_id = [f for f in v for k, v in series_id.items()] + series_id = [k for f in v for k, v in series_id.items()] + files = pd.DataFrame({"file_id": file_id, "series_id": series_id}) return cls(segments, ext_segments, files, index_column) - - def validate(self): - """Validates the attributes of the SegmentList object. - """ - - assert np.all(self.segments['tend']-self.segments['tbeg']>=0) - ok_tbeg = np.logical_or(self.tbeg[1:]-self.tbeg[:-1]>=0, - self.file_id[1:] != self.file_id[:-1]) + """Validates the attributes of the SegmentList object.""" + + assert np.all(self.segments["tend"] - self.segments["tbeg"] >= 0) + ok_tbeg = np.logical_or( + self.tbeg[1:] - self.tbeg[:-1] >= 0, self.file_id[1:] != self.file_id[:-1] + ) if not np.all(ok_tbeg): bad_tbeg = np.logical_not(ok_tbeg) - logging.critical({'file_id': self.file_id[1:][bad_tbeg], - 'tbeg':self.tbeg[1:][bad_tbeg]}) - raise Exception('tbeg is not in the right order') - - + logging.critical( + {"file_id": self.file_id[1:][bad_tbeg], "tbeg": self.tbeg[1:][bad_tbeg]} + ) + raise Exception("tbeg is not in the right order") @property def index_column(self): return self._index_column - - @index_column.setter def index_column(self, value): self._index_column = value self.ext_segments.index = self.ext_segments.ext_segment_id - if value == 'file_id': + if value == "file_id": self.segments.index = self.segments.file_id self.files.index = self.files.file_id - elif value == 'segment_id': + elif value == "segment_id": self.segments.index = self.segments.segment_id self.files.index = self.files.file_id - elif value == 'ext_segment_id': + elif value == "ext_segment_id": self.segments.index = self.segments.ext_segment_id self.files.index = self.files.file_id - elif value == 'series_id': + elif value == "series_id": self.segments.index = self.segments.file_id self.files.index = self.files.series_id - - @property def file_id(self): - return np.asarray(self.segments['file_id']) + return np.asarray(self.segments["file_id"]) - - @property def segment_id(self): - return np.asarray(self.segments['segment_id']) - + return np.asarray(self.segments["segment_id"]) - @property def ext_segment_id(self): - return np.asarray(self.segments['ext_segment_id']) - - + return np.asarray(self.segments["ext_segment_id"]) @property def segment_names(self): - return np.asarray(pd.merge(self.segments, self.ext_segments, - on='ext_segment_id', how='inner')['name']) + return np.asarray( + pd.merge( + self.segments, self.ext_segments, on="ext_segment_id", how="inner" + )["name"] + ) - @property def segment_names_index(self): _, index = 
np.unique(self.segment_names, return_inverse=True) return index - - @property def segment_score(self): - return np.asarray(pd.merge(self.segments, self.ext_segments, - on='ext_segment_id', how='inner')['score']) + return np.asarray( + pd.merge( + self.segments, self.ext_segments, on="ext_segment_id", how="inner" + )["score"] + ) - - @property def uniq_segment_id(self): - return np.asarray(self.ext_segments['ext_segment_id']) + return np.asarray(self.ext_segments["ext_segment_id"]) - - @property def series_id(self): - return np.asarray(self.files['series_id']) - + return np.asarray(self.files["series_id"]) - @property def uniq_file_id(self): - return np.asarry(self.files['file_id']) + return np.asarry(self.files["file_id"]) # if self._uniq_file_id is None: # self._uniq_file_id = np.asarray(self.segments['file_id'].unique()) - - # return self._uniq_file_id + # return self._uniq_file_id - @property def uniq_series_id(self): if self._uniq_series_id is None: - self._uniq_series_id = np.asarray(self.ext_segments['series_id'].unique()) - + self._uniq_series_id = np.asarray(self.ext_segments["series_id"].unique()) + return self._uniq_series_id - @property def num_ext_segments(self): return len(self.ext_segments) - @property def tbeg(self): - return np.asarray(self.segments['tbeg']) - + return np.asarray(self.segments["tbeg"]) - @property def tend(self): - return np.asarray(self.segments['tend']) + return np.asarray(self.segments["tend"]) - - def copy(self): """Makes a copy of the object.""" return deepcopy(self) - def segment_ids_from_file(self, file_id): - """Returns segments_ids corresponding to a given file_id - """ - if self.index_column == 'file_id': - return np.asarray(self.segments.loc[file_id]['segment_id']) - index = self.segments['file_id']==file_id - return np.asarray(self.segments.loc[index]['segment_id']) - + """Returns segments_ids corresponding to a given file_id""" + if self.index_column == "file_id": + return np.asarray(self.segments.loc[file_id]["segment_id"]) + index = self.segments["file_id"] == file_id + return np.asarray(self.segments.loc[index]["segment_id"]) def ext_segment_ids_from_file(self, file_id): - """Returns ext_segments_ids corresponding to a given file_id - """ - if self.index_column == 'file_id': - return np.unique(np.asarray(self.segments.loc[file_id]['ext_segment_id'])) - index = self.segments['file_id']==file_id - return np.unique(np.asarray(self.segments.loc[index]['ext_segment_id'])) + """Returns ext_segments_ids corresponding to a given file_id""" + if self.index_column == "file_id": + return np.unique(np.asarray(self.segments.loc[file_id]["ext_segment_id"])) + index = self.segments["file_id"] == file_id + return np.unique(np.asarray(self.segments.loc[index]["ext_segment_id"])) - - def __iter__(self): - self.iter_idx=0 + self.iter_idx = 0 return self - - def __next__(self): - if self.index_column == 'file_id': + if self.index_column == "file_id": if self.iter_idx < len(self.uniq_file_id): r = self.__getitem__(self.uniq_file_id[self.iter_idx]) else: raise StopIteration() - elif self.index_column == 'series_id': + elif self.index_column == "series_id": if self.iter_idx < len(self.uniq_series_id): r = self.__getitem__(self.uniq_series_id[self.iter_idx]) else: raise StopIteration() - elif self.index_column == 'ext_segment_id': + elif self.index_column == "ext_segment_id": if self.iter_idx < len(self.ext_segments): - r = self.__getitem__(self.ext_segment['ext_segment_id'].iloc[self.iter_idx]) + r = self.__getitem__( + 
self.ext_segment["ext_segment_id"].iloc[self.iter_idx] + ) else: raise StopIteration() else: if self.iter_idx < len(self.segments): - r = self.__getitem__(self.segments['segment_id'].iloc(self.iter_idx)) + r = self.__getitem__(self.segments["segment_id"].iloc(self.iter_idx)) else: raise StopIteration() self.iter_idx += 1 return r - - - def __len__(self): """Returns the number of segments in the list.""" return len(self.segments) - - + def __contains__(self, key): - """ Returns True if the segments contains the key""" + """Returns True if the segments contains the key""" return key in self.segments.segment_id - - - def __getitem__(self, key): """It allows to acces the de segments by file_id or segment like in a ditionary, e.g.: @@ -321,36 +304,50 @@ def __getitem__(self, key): Args: key: Segment or file key Returns: - if index_by_file is True if returns segments of a given file_id + if index_by_file is True if returns segments of a given file_id in SegmentsList format, else it returns DataFrame """ - if self.index_column == 'segment_id': - return pd.merge(self.segments.loc[key], self.ext_segments, - sort=False, how='inner') + if self.index_column == "segment_id": + return pd.merge( + self.segments.loc[key], self.ext_segments, sort=False, how="inner" + ) else: return self.filter([key]) - - - def save(self, file_path, sep=' '): + def save(self, file_path, sep=" "): """Saves segments to text file. Args: file_path: File to write the list. sep: Separator between the fields """ - self.segments[['segment_id', 'file_id', 'tbeg', 'tend', 'ext_segment_id']].to_csv( - file_path + '.segments', sep=sep, float_format='%.3f', index=False, header=False) - self.ext_segments[['ext_segment_id', 'name', 'score']].to_csv( - file_path + '.ext_segments', sep=sep, float_format='%.3f', - index=False, header=False, na_rep='NA') - self.files[['file_id', 'series_id']].to_csv( - file_path + '.files', sep=sep, float_format='%.3f', index=False, header=False) + self.segments[ + ["segment_id", "file_id", "tbeg", "tend", "ext_segment_id"] + ].to_csv( + file_path + ".segments", + sep=sep, + float_format="%.3f", + index=False, + header=False, + ) + self.ext_segments[["ext_segment_id", "name", "score"]].to_csv( + file_path + ".ext_segments", + sep=sep, + float_format="%.3f", + index=False, + header=False, + na_rep="NA", + ) + self.files[["file_id", "series_id"]].to_csv( + file_path + ".files", + sep=sep, + float_format="%.3f", + index=False, + header=False, + ) - - @classmethod - def load(cls, file_path, sep=' ', index_column='file_id'): + def load(cls, file_path, sep=" ", index_column="file_id"): """Loads script list from text file. Args: @@ -360,65 +357,71 @@ def load(cls, file_path, sep=' ', index_column='file_id'): Returns: SegmentList object. 
""" - segments = pd.read_csv(file_path + '.segments', sep=sep, header=None, - names=['segment_id','file_id','tbeg','tend', 'ext_segment_id']) - if path.isfile(file_path + '.ext_segments'): - ext_segments = pd.read_csv(file_path + '.ext_segments', sep=sep, header=None, - names=['ext_segment_id', 'name', 'score'], na_values='NA') + segments = pd.read_csv( + file_path + ".segments", + sep=sep, + header=None, + names=["segment_id", "file_id", "tbeg", "tend", "ext_segment_id"], + ) + if path.isfile(file_path + ".ext_segments"): + ext_segments = pd.read_csv( + file_path + ".ext_segments", + sep=sep, + header=None, + names=["ext_segment_id", "name", "score"], + na_values="NA", + ) else: ext_segments = None - - if path.isfile(file_path + '.files'): - files = pd.read_csv(file_path + '.files', sep=sep, header=None, - names=['file_id', 'series_id']) + + if path.isfile(file_path + ".files"): + files = pd.read_csv( + file_path + ".files", + sep=sep, + header=None, + names=["file_id", "series_id"], + ) else: files = None - - return cls(segments, ext_segments, files, index_column) - + return cls(segments, ext_segments, files, index_column) - def filter(self, filter_key, keep=True): - if self.index_column == 'series_id': + if self.index_column == "series_id": if not keep: - filter_key = np.setdiff1d( - np.asarray(self.files.index), filter_key) + filter_key = np.setdiff1d(np.asarray(self.files.index), filter_key) files = self.files.loc[filter_key] - segments = pd.merge(self.segments, files, on='file_id', how='inner')[ - ['segment_id', 'file_id', 'tbeg','tend','ext_segment_id']] + segments = pd.merge(self.segments, files, on="file_id", how="inner")[ + ["segment_id", "file_id", "tbeg", "tend", "ext_segment_id"] + ] else: if not keep: - filter_key = np.setdiff1d( - np.asarray(self.segments.index), filter_key) + filter_key = np.setdiff1d(np.asarray(self.segments.index), filter_key) segments = self.segments.loc[filter_key] - files = pd.merge(self.files, segments, on='file_id', how='inner')[ - ['file_id', 'series_id']] - - ext_segments = pd.merge(self.ext_segments, segments, - on='ext_segment_id', how='inner')[ - ['ext_segment_id','name']] + files = pd.merge(self.files, segments, on="file_id", how="inner")[ + ["file_id", "series_id"] + ] - return ExtSegmentList(segments, ext_segments, files, self.index_column) + ext_segments = pd.merge( + self.ext_segments, segments, on="ext_segment_id", how="inner" + )[["ext_segment_id", "name"]] + return ExtSegmentList(segments, ext_segments, files, self.index_column) - def split(self, idx, num_parts): - if self.index_column == 'file_id': + if self.index_column == "file_id": key, _ = split_list(self.uniq_file_id, idx, num_parts) - elif self.index_column == 'series_id': + elif self.index_column == "series_id": key, _ = split_list(self.uniq_series_id, idx, num_parts) - elif self.index_column == 'segment_id': + elif self.index_column == "segment_id": key, _ = split_list(self.segment_id, idx, num_parts) - elif self.index_column == 'ext_segment_id': + elif self.index_column == "ext_segment_id": key, _ = split_list(self.uniq_ext_segment_id, idx, num_parts) return self.filter(key) - - @classmethod - def merge(cls, segment_lists, index_column='file_id'): + def merge(cls, segment_lists, index_column="file_id"): segments = [] files = [] ext_segments = [] @@ -426,47 +429,40 @@ def merge(cls, segment_lists, index_column='file_id'): segments.append(sl.segments) files.append(sl.files) ext_segments.append(ext_segments) - + segments = pd.concat(segments).drop_duplicates() files = 
pd.concat(files).drop_duplicates() ext_segments = pd.concat(ext_segments).drop_duplicates() - + return cls(segments, ext_segments, files, index_column) - - def __eq__(self, other): """Equal operator""" eq = self.segments.equals(other.segments) eq = eq and self.index_by_file == other.index_by_file - - return eq + return eq - def __ne__(self, other): """Non-equal operator""" return not self.__eq__(other) - - def __cmp__(self, other): """Comparison operator""" if self.__eq__(other): return 0 return 1 - def merge_adjacent_segments_old(self, max_segments=0): if max_segments == 0: max_segments = len(self.segments) - - segm = pd.merge(self.segments, self.ext_segments, - on='ext_segment_id', how='inner') + + segm = pd.merge( + self.segments, self.ext_segments, on="ext_segment_id", how="inner" + ) segm_1 = segm.shift(1) - index = ((segm.file_id == segm_1.file_id) & - (segm.name == segm_1.name)) + index = (segm.file_id == segm_1.file_id) & (segm.name == segm_1.name) merging = False count = 1 @@ -475,15 +471,18 @@ def merge_adjacent_segments_old(self, max_segments=0): if index.iloc[i]: merging = True if count == 1: - first_idx = i-1 + first_idx = i - 1 last_idx = i count += 1 - - if (not index.iloc[i] or i==len(self.segments)-1 - or count==max_segments) and merging: - if count==max_segments and i < len(self.segments)-1: + + if ( + not index.iloc[i] + or i == len(self.segments) - 1 + or count == max_segments + ) and merging: + if count == max_segments and i < len(self.segments) - 1: # logging.debug(index) - index.iloc[i+1] = False + index.iloc[i + 1] = False # logging.debug(index) r = self.copy() count = 1 @@ -491,34 +490,38 @@ def merge_adjacent_segments_old(self, max_segments=0): first_segment = self.segments.iloc[first_idx].segment_id last_segment = self.segments.iloc[last_idx].segment_id new_ext_segment_id = self.segments[ - first_idx:last_idx+1].segment_id.str.cat(sep='@') - old_ext_segment_ids = np.array(self.segments[ - first_idx:last_idx+1].ext_segment_id.unique()) - - if (len(old_ext_segment_ids) == 1 and - old_ext_segment_ids[0] == new_ext_segment_id): + first_idx : last_idx + 1 + ].segment_id.str.cat(sep="@") + old_ext_segment_ids = np.array( + self.segments[first_idx : last_idx + 1].ext_segment_id.unique() + ) + + if ( + len(old_ext_segment_ids) == 1 + and old_ext_segment_ids[0] == new_ext_segment_id + ): continue - + kkk = self.ext_segments.ext_segment_id == new_ext_segment_id - if np.sum(kkk)>0: + if np.sum(kkk) > 0: logging.debug(first_segment) logging.debug(last_segment) logging.debug(new_ext_segment_id) - r.save('rrrr') - self.save('pppp') - - - self.segments.iloc[first_idx:last_idx+1, - self.segments.columns.get_loc( - 'ext_segment_id')] = new_ext_segment_id - #logging.debug(old_ext_segment_ids) - #logging.debug('A',self.ext_segments.ext_segment_id) - #logging.debug(old_ext_segment_ids.iloc[0]) - #logging.debug(new_ext_segment_id) + r.save("rrrr") + self.save("pppp") + + self.segments.iloc[ + first_idx : last_idx + 1, + self.segments.columns.get_loc("ext_segment_id"), + ] = new_ext_segment_id + # logging.debug(old_ext_segment_ids) + # logging.debug('A',self.ext_segments.ext_segment_id) + # logging.debug(old_ext_segment_ids.iloc[0]) + # logging.debug(new_ext_segment_id) d[old_ext_segment_ids[0]] = new_ext_segment_id - #logging.debug(old_ext_segment_ids[1:]) - #logging.debug(old_ext_segment_ids[1:]) - #logging.debug(self.ext_segments) + # logging.debug(old_ext_segment_ids[1:]) + # logging.debug(old_ext_segment_ids[1:]) + # logging.debug(self.ext_segments) # 
self.ext_segments.drop(old_ext_segment_ids[1:], inplace=True) # for osid in old_ext_segment_ids[1:]: # kk = self.segments.ext_segment_id == osid @@ -528,37 +531,42 @@ def merge_adjacent_segments_old(self, max_segments=0): # logging.debug(self.segments[kk]) # logging.debug(new_ext_segment_id) # raise Exception() - #logging.debug('C',self.ext_segments) - if len(self.ext_segments.ext_segment_id.unique()) != len(self.ext_segments.ext_segment_id): + # logging.debug('C',self.ext_segments) + if len(self.ext_segments.ext_segment_id.unique()) != len( + self.ext_segments.ext_segment_id + ): logging.debug(first_segment) logging.debug(last_segment) logging.debug(new_ext_segment_id) - r.save('rrrr') - self.save('pppp') + r.save("rrrr") + self.save("pppp") - for k,v in d.items(): - self.ext_segments.loc[k,'ext_segment_id'] = v + for k, v in d.items(): + self.ext_segments.loc[k, "ext_segment_id"] = v self.ext_segments.reset_index(drop=True, inplace=True) - drop_index = (~self.ext_segments.ext_segment_id.isin(self.segments.ext_segment_id)) + drop_index = ~self.ext_segments.ext_segment_id.isin( + self.segments.ext_segment_id + ) drop_index = self.ext_segments.index[drop_index] self.ext_segments.drop(drop_index, inplace=True) - self.ext_segments = self.ext_segments.set_index(self.ext_segments.ext_segment_id, drop=False) - assert len(self.ext_segments.ext_segment_id.unique()) == len(self.ext_segments.ext_segment_id) - #logging.debug('E',self.ext_segments) + self.ext_segments = self.ext_segments.set_index( + self.ext_segments.ext_segment_id, drop=False + ) + assert len(self.ext_segments.ext_segment_id.unique()) == len( + self.ext_segments.ext_segment_id + ) + # logging.debug('E',self.ext_segments) - - - def merge_adjacent_segments(self, max_segments=0): if max_segments == 0: max_segments = len(self.segments) - - segm = pd.merge(self.segments, self.ext_segments, - on='ext_segment_id', how='inner') + + segm = pd.merge( + self.segments, self.ext_segments, on="ext_segment_id", how="inner" + ) segm_1 = segm.shift(1) - index = ((segm.file_id == segm_1.file_id) & - (segm.name == segm_1.name)) + index = (segm.file_id == segm_1.file_id) & (segm.name == segm_1.name) count = 1 first_idx = 0 @@ -566,31 +574,35 @@ def merge_adjacent_segments(self, max_segments=0): d = OrderedDict() # logging.debug('MERGE') # logging.debug(self.ext_segments) - for i in range(1, len(self.segments)+1): - if (i==len(self.segments) or not index.iloc[i] or - count==max_segments): - #logging.debug(i,first_idx, last_idx) + for i in range(1, len(self.segments) + 1): + if i == len(self.segments) or not index.iloc[i] or count == max_segments: + # logging.debug(i,first_idx, last_idx) new_ext_segment_id = self.segments[ - first_idx:last_idx+1].segment_id.str.cat(sep='@') - old_ext_segment_ids = np.array(self.segments[ - first_idx:last_idx+1].ext_segment_id.unique()) - - self.segments.iloc[first_idx:last_idx+1, - self.segments.columns.get_loc( - 'ext_segment_id')] = new_ext_segment_id - #logging.debug(old_ext_segment_ids) - #logging.debug('A',self.ext_segments.ext_segment_id) + first_idx : last_idx + 1 + ].segment_id.str.cat(sep="@") + old_ext_segment_ids = np.array( + self.segments[first_idx : last_idx + 1].ext_segment_id.unique() + ) + + self.segments.iloc[ + first_idx : last_idx + 1, + self.segments.columns.get_loc("ext_segment_id"), + ] = new_ext_segment_id + # logging.debug(old_ext_segment_ids) + # logging.debug('A',self.ext_segments.ext_segment_id) # logging.debug('OLD SEGMENTS') # logging.debug(old_ext_segment_ids) # logging.debug('NEW 
SEGMENTS') # logging.debug(new_ext_segment_id) # logging.debug('NEW SEGMENTS FULL') # logging.debug(self.segments[:last_idx+1]) - d[new_ext_segment_id] = self.ext_segments.loc[old_ext_segment_ids[0], 'name'] - - #logging.debug(old_ext_segment_ids[1:]) - #logging.debug(old_ext_segment_ids[1:]) - #logging.debug(self.ext_segments) + d[new_ext_segment_id] = self.ext_segments.loc[ + old_ext_segment_ids[0], "name" + ] + + # logging.debug(old_ext_segment_ids[1:]) + # logging.debug(old_ext_segment_ids[1:]) + # logging.debug(self.ext_segments) # self.ext_segments.drop(old_ext_segment_ids[1:], inplace=True) # for osid in old_ext_segment_ids[1:]: # kk = self.segments.ext_segment_id == osid @@ -600,7 +612,7 @@ def merge_adjacent_segments(self, max_segments=0): # logging.debug(self.segments[kk]) # logging.debug(new_ext_segment_id) # raise Exception() - #logging.debug('C',self.ext_segments) + # logging.debug('C',self.ext_segments) # if len(self.ext_segments.ext_segment_id.unique()) != len(self.ext_segments.ext_segment_id): # logging.debug(first_segment) # logging.debug(last_segment) @@ -609,19 +621,18 @@ def merge_adjacent_segments(self, max_segments=0): # self.save('pppp') count = 1 - first_idx = last_idx+1 + first_idx = last_idx + 1 last_idx = first_idx else: count += 1 last_idx = i - - ext_segment_id = [k for k,v in d.items()] - name = [v for k,v in d.items()] + ext_segment_id = [k for k, v in d.items()] + name = [v for k, v in d.items()] self.ext_segments = pd.DataFrame( - {'ext_segment_id': ext_segment_id, - 'name': name}) - #logging.debug('DICT', d) + {"ext_segment_id": ext_segment_id, "name": name} + ) + # logging.debug('DICT', d) # for k,v in d.items(): # logging.debug(k) # logging.debug(v) @@ -631,23 +642,21 @@ def merge_adjacent_segments(self, max_segments=0): # drop_index = (~self.ext_segments.ext_segment_id.isin(self.segments.ext_segment_id)) # drop_index = self.ext_segments.index[drop_index] # self.ext_segments.drop(drop_index, inplace=True) - self.ext_segments = self.ext_segments.set_index(self.ext_segments.ext_segment_id, drop=False) - #logging.debug(self.ext_segments) + self.ext_segments = self.ext_segments.set_index( + self.ext_segments.ext_segment_id, drop=False + ) + # logging.debug(self.ext_segments) # assert len(self.ext_segments.ext_segment_id.unique()) == len(self.ext_segments.ext_segment_id) # #logging.debug('E',self.ext_segments) - - def assign_names(self, ext_segments_ids, names, scores=None): assert len(names) == len(ext_segments_ids) if scores is not None: assert len(scores) == len(ext_segments_ids) - self.ext_segments.loc[ext_segments_ids, 'name'] = names - self.ext_segments.loc[ext_segments_ids, 'score'] = scores - + self.ext_segments.loc[ext_segments_ids, "name"] = names + self.ext_segments.loc[ext_segments_ids, "score"] = scores - def get_ext_segment_index(self): - d = { s:i for i,s in enumerate(self.ext_segments.ext_segment_id)} + d = {s: i for i, s in enumerate(self.ext_segments.ext_segment_id)} index = np.array([d[s] for s in self.segments.ext_segment_id], dtype=int) return index diff --git a/hyperion/utils/fold_list.py b/hyperion/utils/fold_list.py index 7efa3138..d5731f10 100644 --- a/hyperion/utils/fold_list.py +++ b/hyperion/utils/fold_list.py @@ -14,13 +14,14 @@ from .list_utils import * + class FoldList(object): """Class to contain folds for cross-validation. Attributes: key: String List with the names of the dataset/recording/i-vector folds: Int numpy array with the number of the fold of each key. 
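As a concrete, hypothetical illustration of this class and of the create() factory defined further down (segment, speaker and language ids are all invented):

import numpy as np
from hyperion.utils.fold_list import FoldList

segms = np.array(["e1-a", "e2-a", "e3-a", "s1-a", "s2-a", "s3-a"])
spks = np.array(["e1", "e2", "e3", "s1", "s2", "s3"])          # one group per speaker
langs = np.array(["eng", "eng", "eng", "spa", "spa", "spa"])   # class to balance

# 3 folds: each fold's test set gets one speaker per language.
folds = FoldList.create(segms, num_folds=3,
                        balance_by_key=langs, group_by_key=spks)
train_keys, test_keys = folds.get_fold(0)   # 4 train / 2 test segments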
- mask: Boolean numpy array to mask elements in the key + mask: Boolean numpy array to mask elements in the key """ def __init__(self, fold, key, mask=None): @@ -29,48 +30,36 @@ def __init__(self, fold, key, mask=None): self.mask = mask self.validate() - - def validate(self): - """Validates the class attributes attributes - """ + """Validates the class attributes attributes""" self.key = list2ndarray(self.key) self.fold = list2ndarray(self.fold) if self.fold.dtype != int: self.fold = self.fold.astype(int) assert len(self.key) == len(self.fold) - assert len(np.unique(self.fold[self.fold>=0])) == np.max(self.fold)+1 + assert len(np.unique(self.fold[self.fold >= 0])) == np.max(self.fold) + 1 if self.mask is not None: assert len(self.mask) == len(self.fold) - def copy(self): - """Returns a copy of the object. - """ + """Returns a copy of the object.""" return deepcopy(self) - - def __len__(self): - """Returns number of folds. - """ + """Returns number of folds.""" return self.num_folds() - @property def num_folds(self): - """Returns number of folds. - """ - return np.max(self.fold)+1 + """Returns number of folds.""" + return np.max(self.fold) + 1 - - def align_with_key(self, key, raise_missing=True): """Aligns the fold list with a given key - + Args: key: Key to align the fold and key variables of the object. - raise_missing: if True, raises exception when an element of key is + raise_missing: if True, raises exception when an element of key is not found in the object. """ f, idx = ismember(key, self.key) @@ -80,13 +69,11 @@ def align_with_key(self, key, raise_missing=True): if self.mask is not None: self.mask = self.mask[idx] else: - for i in (f==0).nonzero()[0]: - logging.warning('segment %s not found' % key[i]) + for i in (f == 0).nonzero()[0]: + logging.warning("segment %s not found" % key[i]) if raise_missing: - raise Exception('some scores were not computed') + raise Exception("some scores were not computed") - - def get_fold_idx(self, fold): """Returns a fold boolean indices @@ -104,8 +91,6 @@ def get_fold_idx(self, fold): test_idx = np.logical_and(test_idx, self.mask) return train_idx, test_idx - - def get_fold(self, fold): """Returns a fold keys @@ -120,8 +105,6 @@ def get_fold(self, fold): train_idx, test_idx = self.get_fold_idx(fold) return self.key[train_idx], self.key[test_idx] - - def __getitem__(self, fold): """Returns a fold keys @@ -135,22 +118,19 @@ def __getitem__(self, fold): return self.get_fold(fold) - - - def save(self, file_path, sep=' '): + def save(self, file_path, sep=" "): """Saves object to txt file Args: file_path: File path sep: Separator between fold field and key field """ - with open(file_path, 'w') as f: - for f,k in zip(self.fold, self.key): - f.write('%s%s%s\n' % (f,sep,k)) - + with open(file_path, "w") as f: + for f, k in zip(self.fold, self.key): + f.write("%s%s%s\n" % (f, sep, k)) @classmethod - def load(cls, file_path, sep=' '): + def load(cls, file_path, sep=" "): """Loads object from txt file Args: @@ -161,25 +141,32 @@ def load(cls, file_path, sep=' '): FoldList object """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: fields = [line.rstrip().split(sep=sep, maxsplit=1) for line in f] fold = np.asarray([int(f[0]) for f in fields], dtype=int) key = np.asarray([f[1] for f in fields]) return cls(fold, key) - - @classmethod - def create(cls, segment_key, num_folds, balance_by_key=None, group_by_key=None, mask=None, shuffle=False, seed=1024): + def create( + cls, + segment_key, + num_folds, + balance_by_key=None, + group_by_key=None, + 
mask=None, + shuffle=False, + seed=1024, + ): """Creates a FoldList object. Args: segment_key: String List of recordings/speech segments num_folds: Number of folds that we want to obtain. - balance_by_key: String List of keys indicating a property of the segment to make all folds to + balance_by_key: String List of keys indicating a property of the segment to make all folds to have the same number of elements of each class. E.g. for language ID this would be the language of the recording. - group_by_key: String List of keys indicating a property of the segment to make all the elements + group_by_key: String List of keys indicating a property of the segment to make all the elements of the same class to be in the same fold. E.g. for language ID this would be the speaker ID of the recording. mask: Boolean numpy array to mask elements of segment_key out. @@ -190,7 +177,7 @@ def create(cls, segment_key, num_folds, balance_by_key=None, group_by_key=None, """ if shuffle: rng = np.random.RandomState(seed=seed) - + if group_by_key is None: group_by_key = segment_key @@ -200,38 +187,36 @@ def create(cls, segment_key, num_folds, balance_by_key=None, group_by_key=None, _, balance_by_key = np.unique(balance_by_key, return_inverse=True) if mask is not None: - balance_by_key[mask==False] = -1 - - folds = - np.ones((len(segment_key),), dtype=int) - + balance_by_key[mask == False] = -1 + + folds = -np.ones((len(segment_key),), dtype=int) + num_classes = np.max(balance_by_key) + 1 for i in range(num_classes): - + idx_i = (balance_by_key == i).nonzero()[0] group_key_i = group_by_key[idx_i] _, group_key_i = np.unique(group_key_i, return_inverse=True) num_groups_i = np.max(group_key_i) + 1 - delta = float(num_groups_i)/num_folds + delta = float(num_groups_i) / num_folds if shuffle: shuffle_idx = np.arange(num_groups_i) rng.shuffle(shuffle_idx) group_key_tmp = np.zeros_like(group_key_i) for j in range(num_groups_i): - group_key_tmp[group_key_i==j] = shuffle_idx[j] + group_key_tmp[group_key_i == j] = shuffle_idx[j] group_key_i = group_key_tmp - + for j in range(num_folds): - k1 = int(np.round(j*delta)) - k2 = int(np.round((j+1)*delta)) - idx_ij = np.logical_and(group_key_i>=k1, group_key_i= k1, group_key_i < k2) idx_fold = idx_i[idx_ij] folds[idx_fold] = j if mask is None: - assert np.all(folds>=0) + assert np.all(folds >= 0) else: - assert np.all(folds[mask]>=0) + assert np.all(folds[mask] >= 0) return cls(folds, segment_key, mask) - - diff --git a/hyperion/utils/framing.py b/hyperion/utils/framing.py index 68868adc..fc090c1a 100644 --- a/hyperion/utils/framing.py +++ b/hyperion/utils/framing.py @@ -12,7 +12,7 @@ class Framing(object): """Class to create frames from signals or superframes from frame sequences. - + Attributes: frame_length: Length of the frames. frame_shift: Shift of the frames. @@ -47,7 +47,7 @@ class Framing(object): Pads with the reflection of the vector mirrored along the edge of the array. 'wrap' - Pads with the wrap of the vector along the axis. The first values + Pads with the wrap of the vector along the axis. The first values are used to pad the end and the end values are used to pad the beginning. @@ -55,28 +55,27 @@ class Framing(object): pad_side: padding side {symmetric (default), left, right}. pad_width: Number of values padded to the edges of each axis. - ((before_1, after_1), ... (before_N, after_N)) unique pad widths for each axis. - ((before, after),) yields same before and after pad for each axis. + ((before_1, after_1), ... 
(before_N, after_N)) unique pad widths for each axis. + ((before, after),) yields same before and after pad for each axis. (pad,) or int is a shortcut for before = after = pad width for all axes. pad_kwargs: extra arguments for numpy.pad """ - def __init__(self, frame_length, frame_shift=1, - pad_mode=None, pad_side='symmetric', **kwargs): + + def __init__( + self, frame_length, frame_shift=1, pad_mode=None, pad_side="symmetric", **kwargs + ): self.frame_length = frame_lenght self.frame_shift = frame_shift self.pad_mode = pad_mode self.pad_width = None if self.pad_mode is not None: - self.pad_width = self.create_pad_width( - pad_side, frame_length, frame_shift) + self.pad_width = self.create_pad_width(pad_side, frame_length, frame_shift) self.pad_kwargs = kwargs - - @static def create_pad_width(pad_side, frame_length, frame_shift): - """ Calculates the proper pad_with for left and rigth from the frame lengths and shift. - + """Calculates the proper pad_with for left and rigth from the frame lengths and shift. + Args: pad_side: symmetric, left, right. frame_length: Frame length. @@ -86,62 +85,56 @@ def create_pad_width(pad_side, frame_length, frame_shift): 2D tuple with left and right pad width. """ overlap = frame_length - frame_shift - if pad_side=='symmetric': - pad_width=(int(np.ceil(overlap/2)), - int(np.floor(overlap/2))) - elif pad_side=='left': - pad_width=(int(overlap), 0) - elif pad_side=='right': - pad_width=(0, int(overlap)) + if pad_side == "symmetric": + pad_width = (int(np.ceil(overlap / 2)), int(np.floor(overlap / 2))) + elif pad_side == "left": + pad_width = (int(overlap), 0) + elif pad_side == "right": + pad_width = (0, int(overlap)) else: - raise Exception('Unknown pad_side=%s' % pad_side) - + raise Exception("Unknown pad_side=%s" % pad_side) - def create_frames(self, x): """Create the frames. - + Args: x: 1D or 2D numpy array. Returns: - 2D numpy array. + 2D numpy array. If x is 1D, each output frame (row) will contain frame_length samples from x. If x is 2D, each output frame (row) will contain frame_length rows from x. - + """ if self.pad_mode is not None: - x=self.apply_padding(x) + x = self.apply_padding(x) - if x.ndim==1: - num_samples=x.shape[0] - in_dim=1 + if x.ndim == 1: + num_samples = x.shape[0] + in_dim = 1 else: - num_samples=x.shape[0] - in_dim=x.shape[1] - - num_out_frames=int(np.floor((num_samples-frame_length)/frame_shift+1)) - - vec_x=x.ravel() - out_dim=frame_length*in_dim - X=np.zeros((num_out_frames, out_dim), dtype=float_cpu()) - - start=0 - stop=out_dim - shift=in_dim*frame_shift + num_samples = x.shape[0] + in_dim = x.shape[1] + + num_out_frames = int(np.floor((num_samples - frame_length) / frame_shift + 1)) + + vec_x = x.ravel() + out_dim = frame_length * in_dim + X = np.zeros((num_out_frames, out_dim), dtype=float_cpu()) + + start = 0 + stop = out_dim + shift = in_dim * frame_shift for i in range(num_out_frames): - X[i,:]=vec_x[start:stop] - start+=shift - stop+=shift - - return X + X[i, :] = vec_x[start:stop] + start += shift + stop += shift + return X - def apply_padding(self, x): - """ Calls numpy.pad with the rigth arguments. 
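A standalone sketch of the frame/superframe extraction that create_frames above performs may help: windows of frame_length rows are taken every frame_shift rows and flattened into one output row. The array contents here are arbitrary.

import numpy as np

def make_frames(x, frame_length, frame_shift):
    # Equivalent standalone computation: the frame count follows
    # floor((num_samples - frame_length) / frame_shift + 1).
    x = np.atleast_2d(x.T).T                 # 1D signal -> column vector
    num_samples, in_dim = x.shape
    num_frames = int(np.floor((num_samples - frame_length) / frame_shift + 1))
    out = np.zeros((num_frames, frame_length * in_dim))
    for i in range(num_frames):
        start = i * frame_shift
        out[i] = x[start:start + frame_length].ravel()
    return out

feats = np.arange(20.0).reshape(10, 2)       # 10 feature frames of dim 2
superframes = make_frames(feats, frame_length=3, frame_shift=2)
print(superframes.shape)                     # -> (4, 6)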
- """ + """Calls numpy.pad with the rigth arguments.""" pad_width = self.pad_width - if x.ndim==2: - pad_width=(pad_width, (0,0)) + if x.ndim == 2: + pad_width = (pad_width, (0, 0)) return np.pad(x, pad_width, mode=self.pad_mode, **self.pad_kwargs) diff --git a/hyperion/utils/kaldi_io_funcs.py b/hyperion/utils/kaldi_io_funcs.py index 11f92ee2..03d4b1eb 100644 --- a/hyperion/utils/kaldi_io_funcs.py +++ b/hyperion/utils/kaldi_io_funcs.py @@ -9,112 +9,99 @@ def init_kaldi_output_stream(f, binary): - """Writes Kaldi Ark file binary marker. - """ + """Writes Kaldi Ark file binary marker.""" if binary: - f.write(b'\0B') + f.write(b"\0B") + - def init_kaldi_input_stream(f): - """Reads Kaldi Ark file binary marker. - """ - if peek(f, True, 2) == b'\0B': + """Reads Kaldi Ark file binary marker.""" + if peek(f, True, 2) == b"\0B": f.read(2) return True return False def check_token(token): - """Checks that token doesn't have spaces. - """ - assert token.find(' ') == -1, 'Token %s is not valid' % token + """Checks that token doesn't have spaces.""" + assert token.find(" ") == -1, "Token %s is not valid" % token def is_token(token): - """Checks if token is a valid token. - """ + """Checks if token is a valid token.""" if len(token) == 0: return False if not token.isprintable(): return False - if ' ' in token: + if " " in token: return False return True - - + + def read_token(f, binary): - """Reads next token from Ark file. - """ + """Reads next token from Ark file.""" if not binary: - while f.peek(1) == b' ': + while f.peek(1) == b" ": f.read(1) - token = b'' + token = b"" else: - token = b'' + token = b"" while 1: c = f.read(1) - if c == b' ' or c == b'': + if c == b" " or c == b"": break token += c - - return token.decode('ascii') - + + return token.decode("ascii") def write_token(f, binary, token): - """Writes token to Ark file. - """ + """Writes token to Ark file.""" check_token(token) - token = '%s ' % token + token = "%s " % token if binary: - token = token.encode('ascii') + token = token.encode("ascii") f.write(token) - def peek(f, binary, num_bytes=1): - """Peeks num_bytes from Ark file. - """ + """Peeks num_bytes from Ark file.""" if not binary: - while f.peek(1)[0] == ' ': + while f.peek(1)[0] == " ": f.read(1) p = f.peek(num_bytes)[:num_bytes] peek_bytes = len(p) if peek_bytes < num_bytes: f.read(peek_bytes) - delta_bytes = num_bytes-peek_bytes + delta_bytes = num_bytes - peek_bytes p = p + f.peek(delta_bytes)[:delta_bytes] f.seek(-peek_bytes, 1) return p - def read_int32(f, binary): - """Reads Int32 from Ark file. 
- """ + """Reads Int32 from Ark file.""" if binary: - size = int(struct.unpack('b', f.read(1))[0]) - assert size == 4, 'Wrong size %d' % size - val = struct.unpack(' num_rows (%d)' % - (row_offset, total_rows)) + assert row_offset <= total_rows, "row_offset (%d) > num_rows (%d)" % ( + row_offset, + total_rows, + ) total_rows -= row_offset if num_rows == 0: num_rows = total_rows else: - assert num_rows <= total_rows, ( - 'requested rows (%d) > available rows (%d)' % - (num_rows, total_rows)) + assert ( + num_rows <= total_rows + ), "requested rows (%d) > available rows (%d)" % ( + num_rows, + total_rows, + ) rows_left = total_rows - num_rows - + else: num_rows = 1 num_cols = read_int32(f, binary) - + if row_offset > 0: - f.seek(row_offset*num_cols*np.dtype(dtype).itemsize, 1) - data = f.read(num_rows*num_cols*np.dtype(dtype).itemsize) + f.seek(row_offset * num_cols * np.dtype(dtype).itemsize, 1) + data = f.read(num_rows * num_cols * np.dtype(dtype).itemsize) if rows_left > 0 and sequential_mode: - f.seek(rows_left*num_cols*np.dtype(dtype).itemsize, 1) - + f.seek(rows_left * num_cols * np.dtype(dtype).itemsize, 1) + vec = np.frombuffer(data, dtype=dtype) - + if ndim == 2: return cls(np.reshape(vec, (num_rows, num_cols))) return cls(vec) - + else: - if row_offset>0 or num_rows > 0: - raise NotImplementedError('Reading slices supported in text mode because it is inefficient') + if row_offset > 0 or num_rows > 0: + raise NotImplementedError( + "Reading slices supported in text mode because it is inefficient" + ) first_line = True rows = [] is_vector = False for line in f: if isinstance(line, bytes): - line = line.decode('ascii') - - if len(line) == 0 : - raise BadInputFormat('EOF reading matrix') # eof, should not happen! - if len(line.strip()) == 0 : continue # skip empty line + line = line.decode("ascii") + + if len(line) == 0: + raise BadInputFormat( + "EOF reading matrix" + ) # eof, should not happen! + if len(line.strip()) == 0: + continue # skip empty line arr = line.strip().split() if first_line: - if arr == '[]': - return np.array([], dtype='float32') - if arr[0] != '[': - raise ValueError('Wrong matrix format %s ' % line) + if arr == "[]": + return np.array([], dtype="float32") + if arr[0] != "[": + raise ValueError("Wrong matrix format %s " % line) first_line = False if len(arr) > 1: is_vector = True arr = arr[1:] else: continue - - if arr[-1] != ']': - rows.append(np.array(arr, dtype='float32')) # not last line - else: - rows.append(np.array(arr[:-1], dtype='float32')) # last line + + if arr[-1] != "]": + rows.append(np.array(arr, dtype="float32")) # not last line + else: + rows.append(np.array(arr[:-1], dtype="float32")) # last line mat = np.vstack(rows) if mat.shape[0] == 1 and is_vector: mat = mat.ravel() return cls(mat) - - return cls(np.array([], dtype='float32')) - - + return cls(np.array([], dtype="float32")) + def write(self, f, binary): - """ Writes matrix/vector to ark file. + """Writes matrix/vector to ark file. Args: f: Python file object. binary: True if we write in binary file and False if we write to text file. 
""" if binary: - t1 = 'F' if self.data.dtype == np.float32 else 'D' - t2 = 'M' if self.data.ndim == 2 else 'V' - token = t1+t2 + t1 = "F" if self.data.dtype == np.float32 else "D" + t2 = "M" if self.data.ndim == 2 else "V" + token = t1 + t2 write_token(f, binary, token) if self.data.ndim == 2: write_int32(f, binary, self.num_rows) @@ -169,22 +178,20 @@ def write(self, f, binary): f.write(self.data.tobytes()) else: if self.num_cols == 0: - f.write(' [ ]\n') + f.write(" [ ]\n") else: - f.write(' [') + f.write(" [") if self.data.ndim == 1: - f.write(' ') + f.write(" ") for j in range(self.num_cols): - f.write('%f ' % self.data[j]) + f.write("%f " % self.data[j]) else: for i in range(self.num_rows): - f.write('\n ') + f.write("\n ") for j in range(self.num_cols): - f.write('%f ' % self.data[i,j]) - f.write(']\n') + f.write("%f " % self.data[i, j]) + f.write("]\n") - - @staticmethod def read_shape(f, binary, sequential_mode=True): """Reads the shape of the current matrix/vector in the ark file. @@ -193,25 +200,29 @@ def read_shape(f, binary, sequential_mode=True): f: Python file object binary: True if we read from binary file and False if we read from text file. sequential_mode: True if we are reading the ark file sequentially and False if - we are using random access. In sequential_mode=True it moves the file pointer - to the next matrix. + we are using random access. In sequential_mode=True it moves the file pointer + to the next matrix. Returns: Tuple object with shape. """ if binary: peekval = peek(f, binary) - if peekval == b'C': + if peekval == b"C": return KaldiCompressedMatrix.read_shape(f, binary, sequential_mode) token = read_token(f, binary) - if token[0] == 'F' : dtype = 'float32' - elif token[0] == 'D': dtype = 'float64' + if token[0] == "F": + dtype = "float32" + elif token[0] == "D": + dtype = "float64" else: - ValueError('Wrong token %s ' % token) - if token[1] == 'V' : ndim = 1 - elif token[1] == 'M': ndim = 2 + ValueError("Wrong token %s " % token) + if token[1] == "V": + ndim = 1 + elif token[1] == "M": + ndim = 2 else: - ValueError('Wrong token %s ' % token) + ValueError("Wrong token %s " % token) if ndim == 2: num_rows = read_int32(f, binary) @@ -219,7 +230,7 @@ def read_shape(f, binary, sequential_mode=True): num_rows = 1 num_cols = read_int32(f, binary) if sequential_mode: - f.seek(num_rows*num_cols*np.dtype(dtype).itemsize, 1) + f.seek(num_rows * num_cols * np.dtype(dtype).itemsize, 1) if ndim == 1: return (num_cols,) @@ -229,39 +240,42 @@ def read_shape(f, binary, sequential_mode=True): matrix = KaldiMatrix.read(f, binary, sequential_mode=sequential_mode) return matrix.data.shape - - -compression_methods = ['auto', - 'speech-feat', - '2byte-auto', - '2byte-signed-integer', - '1byte-auto', - '1byte-unsigned-integer', - '1byte-0-1', - 'speech-feat-t'] - -compression_method2format = {'speech-feat': 1, - '2byte-auto': 2, - '2byte-signed-integer': 2, - '1byte-auto': 3, - '1byte-unsigned-integer': 3, - '1byte-0-1': 3, - 'speech-feat-t': 4} - - + +compression_methods = [ + "auto", + "speech-feat", + "2byte-auto", + "2byte-signed-integer", + "1byte-auto", + "1byte-unsigned-integer", + "1byte-0-1", + "speech-feat-t", +] + +compression_method2format = { + "speech-feat": 1, + "2byte-auto": 2, + "2byte-signed-integer": 2, + "1byte-auto": 3, + "1byte-unsigned-integer": 3, + "1byte-0-1": 3, + "speech-feat-t": 4, +} + + class KaldiCompressedMatrix(object): """Class to read/write compressed kaldi matrices. 
- - When compressed matrix is found in file, it calls - KaldiCompressedMatrix class automatically to uncompress. - - Attributes: - data: numpy byte array with the compressed coded matrix. - data_format: {1, 2, 3, 4} - min_value: Minimum value in the matrix. - data_range: max_value - min_value - num_rows: Number of rows in the matrix - num_columns: Number of columns in the matrix + + When compressed matrix is found in file, it calls + KaldiCompressedMatrix class automatically to uncompress. + + Attributes: + data: numpy byte array with the compressed coded matrix. + data_format: {1, 2, 3, 4} + min_value: Minimum value in the matrix. + data_range: max_value - min_value + num_rows: Number of rows in the matrix + num_columns: Number of columns in the matrix """ def __init__(self, data=None): @@ -271,60 +285,72 @@ def __init__(self, data=None): self.data_range = 0 self.num_rows = 0 self.num_cols = 0 - + if data is not None: self._unpack_header() # self.col_headers = col_headers - - def get_data_attrs(self): """ Returns: Coded matrix values in 2D format. Dictionary object with data attributes: data_format, min_value, data_range, percentiles. """ - attrs = {'data_format': self.data_format, - 'min_value': self.min_value, - 'data_range': self.data_range} + attrs = { + "data_format": self.data_format, + "min_value": self.min_value, + "data_range": self.data_range, + } header_offset = 20 if self.data_format == 1 or self.data_format == 4: - data_offset = header_offset+self.num_cols*8 + data_offset = header_offset + self.num_cols * 8 p = np.frombuffer(self.data[header_offset:data_offset], dtype=np.uint16) - attrs['perc'] = p - data = np.reshape(np.frombuffer(self.data[data_offset:], dtype=np.uint8), - (self.num_cols, self.num_rows)).transpose().copy() + attrs["perc"] = p + data = ( + np.reshape( + np.frombuffer(self.data[data_offset:], dtype=np.uint8), + (self.num_cols, self.num_rows), + ) + .transpose() + .copy() + ) elif self.data_format == 2: - data = np.reshape(np.frombuffer(self.data[header_offset:], dtype=np.uint16), - (self.num_rows, self.num_cols)) + data = np.reshape( + np.frombuffer(self.data[header_offset:], dtype=np.uint16), + (self.num_rows, self.num_cols), + ) else: - data = np.reshape(np.frombuffer(self.data[header_offset:], dtype=np.uint8), - (self.num_rows, self.num_cols)) + data = np.reshape( + np.frombuffer(self.data[header_offset:], dtype=np.uint8), + (self.num_rows, self.num_cols), + ) return data, attrs - - @classmethod def build_from_data_attrs(cls, data, attrs): - """ Builds object from coded values and attributes + """Builds object from coded values and attributes Args: data: Coded matrix values in 2D format. attrs: Dictionary object with data attributes: data_format, min_value, data_range, percentiles. - + Returns: KaldiCompressedMatrix object. 
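To make the header layout concrete, the sketch below assumes a 20-byte global header holding format id, minimum value, value range, number of rows and number of columns, matching the header_offset of 20 and the unpack order used in this class; the exact struct format string ("<iffii") is an assumption.

import struct

# Hypothetical 20-byte global header: format 1 ("speech-feat"), min -3.5,
# range 7.0, 250 rows, 40 columns.  Values are invented for illustration.
header = struct.pack("<iffii", 1, -3.5, 7.0, 250, 40)
assert len(header) == 20
data_format, min_value, data_range, num_rows, num_cols = struct.unpack("<iffii", header)

With method="auto", compress() below falls back to "2byte-auto" for matrices with only a few rows and otherwise uses the percentile-based "speech-feat" format.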
""" num_rows = data.shape[0] num_cols = data.shape[1] - h = struct.pack(' 8: - method = 'speech-feat' + method = "speech-feat" else: - method = '2byte-auto' + method = "2byte-auto" self.data_format = compression_method2format[method] self.num_rows = mat.shape[0] self.num_cols = mat.shape[1] - #now compute min_val and range - if (method == 'speech-feat' or method == '2byte-auto' or - method == '1byte-auto' or method == 'speech-feat-t'): + # now compute min_val and range + if ( + method == "speech-feat" + or method == "2byte-auto" + or method == "1byte-auto" + or method == "speech-feat-t" + ): min_value = np.min(mat) max_value = np.max(mat) - if max_value == min_value : + if max_value == min_value: max_value = min_value + 1 + np.abs(min_value) - assert min_value-min_value==0 and max_value-max_value==0, 'cannot compress matrix with infs or nans' + assert ( + min_value - min_value == 0 and max_value - max_value == 0 + ), "cannot compress matrix with infs or nans" self.min_value = min_value self.data_range = max_value - min_value assert self.data_range > 0 - elif method == '2byte-signed-integer': + elif method == "2byte-signed-integer": self.min_value = -32768.0 self.data_range = 65535.0 - elif method == '1byte-unsigned-integer': + elif method == "1byte-unsigned-integer": self.min_value = 0.0 self.data_range = 255.0 - elif method == '1byte-0-1': + elif method == "1byte-0-1": self.min_value = 0.0 self.data_range = 1.0 else: @@ -405,38 +435,39 @@ def _compute_global_header(self, mat, method): header = self._pack_header() return header - - @staticmethod def _get_read_info(header, row_offset=0, num_rows=0): """Gets info needed to read the matrix from file""" - data_format, min_value, data_range, total_rows, num_cols = struct.unpack(' num_rows (%d)' % - (row_offset, total_rows)) + assert row_offset <= total_rows, "row_offset (%d) > num_rows (%d)" % ( + row_offset, + total_rows, + ) total_rows -= row_offset if num_rows == 0: num_rows = total_rows else: - assert num_rows <= total_rows, ( - 'requested rows (%d) > available rows (%d)' % - (num_rows, total_rows)) + assert ( + num_rows <= total_rows + ), "requested rows (%d) > available rows (%d)" % (num_rows, total_rows) rows_left = total_rows - num_rows - + bytes_col_header = 0 - if data_format == 1 or data_format==4: - bytes_col_header = num_cols*8 + if data_format == 1 or data_format == 4: + bytes_col_header = num_cols * 8 bytes_offset = row_offset bytes_data = num_rows bytes_left = rows_left elif data_format == 2: - bytes_offset = 2*row_offset - bytes_data = 2*num_rows - bytes_left = 2*rows_left + bytes_offset = 2 * row_offset + bytes_data = 2 * num_rows + bytes_left = 2 * rows_left else: bytes_offset = row_offset bytes_data = num_rows @@ -444,12 +475,10 @@ def _get_read_info(header, row_offset=0, num_rows=0): if make_header: header = struct.pack( - '1.0] = 1 - f[f<0.0] = 0 - return (f*65535 + 0.499).astype(np.uint16) - + f = (mat.ravel() - self.min_value) / self.data_range + f[f > 1.0] = 1 + f[f < 0.0] = 0 + return (f * 65535 + 0.499).astype(np.uint16) - def _float_to_uint8(self, mat): - f = (mat.ravel() - self.min_value)/self.data_range - f[f>1.0] = 1 - f[f<0.0] = 0 - return (f*255 + 0.499).astype(np.uint8) + f = (mat.ravel() - self.min_value) / self.data_range + f[f > 1.0] = 1 + f[f < 0.0] = 0 + return (f * 255 + 0.499).astype(np.uint8) - - def _uint16_to_float(self, byte_data): return self.min_value + self.data_range * 1.52590218966964e-05 * np.frombuffer( - byte_data, dtype=np.uint16).astype(float_cpu()) - - + byte_data, dtype=np.uint16 + 
).astype(float_cpu()) def _uint8_to_float(self, byte_data): - return self.min_value + self.data_range/255.0 * np.frombuffer( - byte_data, dtype=np.uint8).astype(float_cpu()) - + return self.min_value + self.data_range / 255.0 * np.frombuffer( + byte_data, dtype=np.uint8 + ).astype(float_cpu()) - def _compute_column_header(self, v): - """ Creates the column headers for the speech-feat compression. + """Creates the column headers for the speech-feat compression. Args: v: numpy array with the column to compress. @@ -539,21 +558,33 @@ def _compute_column_header(self, v): """ one = np.uint16(1) if self.num_rows >= 5: - quarter_nr = int(self.num_rows/4) - v_sort = np.partition(v, (0, quarter_nr, 3*quarter_nr, -1)) + quarter_nr = int(self.num_rows / 4) + v_sort = np.partition(v, (0, quarter_nr, 3 * quarter_nr, -1)) perc_0 = min(self._float_to_uint16(v_sort[0])[0], np.uint16(65532)) - perc_25 = min(max(self._float_to_uint16(v_sort[quarter_nr])[0], perc_0 + one), np.uint16(65533)) - perc_75 = min(max(self._float_to_uint16(v_sort[3*quarter_nr])[0], perc_25 + one), np.uint16(65534)) + perc_25 = min( + max(self._float_to_uint16(v_sort[quarter_nr])[0], perc_0 + one), + np.uint16(65533), + ) + perc_75 = min( + max(self._float_to_uint16(v_sort[3 * quarter_nr])[0], perc_25 + one), + np.uint16(65534), + ) perc_100 = max(self._float_to_uint16(v_sort[-1])[0], perc_75 + one) else: v_sort = np.sort(v) perc_0 = min(self._float_to_uint16(v_sort[0])[0], np.uint16(65532)) if self.num_rows > 1: - perc_25 = min(max(self._float_to_uint16(v_sort[1])[0], perc_0 + one), np.uint16(65533)) + perc_25 = min( + max(self._float_to_uint16(v_sort[1])[0], perc_0 + one), + np.uint16(65533), + ) else: perc_25 = perc_0 + one if self.num_rows > 2: - perc_75 = min(max(self._float_to_uint16(v_sort[2])[0], perc_25 + one), np.uint16(65534)) + perc_75 = min( + max(self._float_to_uint16(v_sort[2])[0], perc_25 + one), + np.uint16(65534), + ) else: perc_75 = perc_25 + one @@ -561,12 +592,10 @@ def _compute_column_header(self, v): perc_100 = max(self._float_to_uint16(v_sort[3])[0], perc_75 + one) else: perc_100 = perc_75 + one - return struct.pack('64] = 64 + f = (v[idx] - p0) / (p25 - p0) + c = (f * 64 + 0.5).astype(np.int32) + c[c < 0] = 0 + c[c > 64] = 64 v_out[idx] = c idx = np.logical_and(v >= p25, v < p75) - f = (v[idx] - p25)/(p75-p25) - c = 64 + (f*128+0.5).astype(np.int32) - c[c<64] = 64 - c[c>192] = 192 + f = (v[idx] - p25) / (p75 - p25) + c = 64 + (f * 128 + 0.5).astype(np.int32) + c[c < 64] = 64 + c[c > 192] = 192 v_out[idx] = c idx = v >= p75 - f = (v[idx] - p75)/(p100-p75) - c = 192 + (f*63+0.5).astype(np.int32) - c[c<192] = 192 - c[c>255] = 255 + f = (v[idx] - p75) / (p100 - p75) + c = 192 + (f * 63 + 0.5).astype(np.int32) + c[c < 192] = 192 + c[c > 255] = 255 v_out[idx] = c return v_out.astype(np.uint8) - - @staticmethod def _char_to_float(v, p0, p25, p75, p100): """Decodes the column from bytes to float using the given percentiles""" v_in = np.frombuffer(v, dtype=np.uint8).astype(float_cpu()) v_out = np.zeros(v_in.shape, dtype=float_cpu()) idx = v_in <= 64 - v_out[idx] = p0 + (p25-p0)*v_in[idx]/64.0 - idx = np.logical_and(v_in>64, v_in<=192) - v_out[idx] = p25 + (p75-p25)*(v_in[idx] - 64)/128.0 + v_out[idx] = p0 + (p25 - p0) * v_in[idx] / 64.0 + idx = np.logical_and(v_in > 64, v_in <= 192) + v_out[idx] = p25 + (p75 - p25) * (v_in[idx] - 64) / 128.0 idx = v_in > 192 - v_out[idx] = p75 + (p100-p75)*(v_in[idx] - 192)/63.0 + v_out[idx] = p75 + (p100 - p75) * (v_in[idx] - 192) / 63.0 return v_out - - def to_ndarray(self): 
"""Uncompresses matrix to numpy array. Returns: @@ -646,24 +667,25 @@ def to_ndarray(self): if self.data_format == 1 or self.data_format == 4: mat = np.zeros((self.num_rows, self.num_cols), dtype=float_cpu()) header_offset = 20 - data_offset = header_offset+self.num_cols*8 + data_offset = header_offset + self.num_cols * 8 for i in range(self.num_cols): - mat[:,i] = self._uncompress_column( - self.data[header_offset:header_offset+8], - self.data[data_offset:data_offset+self.num_rows]) + mat[:, i] = self._uncompress_column( + self.data[header_offset : header_offset + 8], + self.data[data_offset : data_offset + self.num_rows], + ) header_offset += 8 data_offset += self.num_rows elif self.data_format == 2: - mat = np.reshape(self._uint16_to_float(self.data[20:]), - (self.num_rows, self.num_cols)).astype(float_cpu(), copy=False) + mat = np.reshape( + self._uint16_to_float(self.data[20:]), (self.num_rows, self.num_cols) + ).astype(float_cpu(), copy=False) else: - mat = np.reshape(self._uint8_to_float(self.data[20:]), - (self.num_rows, self.num_cols)).astype(float_cpu(), copy=False) + mat = np.reshape( + self._uint8_to_float(self.data[20:]), (self.num_rows, self.num_cols) + ).astype(float_cpu(), copy=False) return mat - - def to_matrix(self): """Uncompresses matrix to KaldiMatrix object. Returns: @@ -672,9 +694,7 @@ def to_matrix(self): mat = self.to_ndarray() return KaldiMatrix(mat) - - @classmethod def read(cls, f, binary, row_offset=0, num_rows=0, sequential_mode=True): """Reads kaldi compressed matrix/vector from file. @@ -693,32 +713,42 @@ def read(cls, f, binary, row_offset=0, num_rows=0, sequential_mode=True): if binary: peekval = peek(f, binary) - if peekval == b'C': + if peekval == b"C": token = read_token(f, binary) - if token == 'CM': + if token == "CM": data_format = 1 - elif token == 'CM2': + elif token == "CM2": data_format = 2 - elif token == 'CM3': + elif token == "CM3": data_format = 3 - elif token == 'CM4': + elif token == "CM4": data_format = 4 else: - raise ValueError('Unexpected token %s' % token) - - header = struct.pack(' 0: - f.seek(bytes_offset*num_cols, 1) - data = f.read(bytes_col*num_cols) - data = np.frombuffer(data, dtype=np.uint8).reshape( - -1, num_cols).transpose().tobytes() + f.seek(bytes_offset * num_cols, 1) + data = f.read(bytes_col * num_cols) + data = ( + np.frombuffer(data, dtype=np.uint8) + .reshape(-1, num_cols) + .transpose() + .tobytes() + ) if bytes_left > 0: - f.seek(bytes_left*num_cols, 1) + f.seek(bytes_left * num_cols, 1) data = header + col_header + data else: if bytes_offset > 0: - f.seek(bytes_offset*num_cols, 1) - data = f.read(bytes_col*num_cols) + f.seek(bytes_offset * num_cols, 1) + data = f.read(bytes_col * num_cols) if bytes_left > 0: - f.seek(bytes_left*num_cols, 1) + f.seek(bytes_left * num_cols, 1) data = header + data - + return cls(data) else: matrix = KaldiMatrix.read(f, binary, row_offset, num_rows) return cls.compress(matrix) - if row_offset>0 or num_rows > 0: - raise NotImplementedError('Reading slices supported in text mode because it is inefficient') + if row_offset > 0 or num_rows > 0: + raise NotImplementedError( + "Reading slices supported in text mode because it is inefficient" + ) matrix = KaldiMatrix.read(f, binary) return cls.compress(matrix) - - def write(self, f, binary): - """ Writes matrix/vector to ark file. + """Writes matrix/vector to ark file. Args: f: Python file object. 
@@ -771,32 +805,34 @@ def write(self, f, binary): if binary: if self.data is not None: if self.data_format == 1: - write_token(f, binary, 'CM') + write_token(f, binary, "CM") elif self.data_format == 2: - write_token(f, binary, 'CM2') + write_token(f, binary, "CM2") elif self.data_format == 3: - write_token(f, binary, 'CM3') + write_token(f, binary, "CM3") elif self.data_format == 4: - write_token(f, binary, 'CM4') - + write_token(f, binary, "CM4") + if self.data_format == 4: header_offset = 20 - data_offset = header_offset+self.num_cols*8 - data = np.frombuffer(self.data[data_offset:], dtype=np.uint8).reshape( - self.num_cols, self.num_rows).transpose().tobytes() + data_offset = header_offset + self.num_cols * 8 + data = ( + np.frombuffer(self.data[data_offset:], dtype=np.uint8) + .reshape(self.num_cols, self.num_rows) + .transpose() + .tobytes() + ) f.write(self.data[4:data_offset]) f.write(data) else: f.write(self.data[4:]) else: - write_token(f, binary, 'CM') - header = struct.pack('thr + thr = np.max(P) - 35 + return P > thr def compute_snr(x, n, axis=-1): - P_x = 10*np.log10(np.mean(x**2, axis=axis)) - P_n = 10*np.log10(np.mean(n**2, axis=axis)) + P_x = 10 * np.log10(np.mean(x ** 2, axis=axis)) + P_n = 10 * np.log10(np.mean(n ** 2, axis=axis)) return P_x - P_n def filter_args(valid_args, kwargs): - """ Filters arguments from a dictionary + """Filters arguments from a dictionary Args: valid_args: list/tuple of valid arguments @@ -70,6 +87,4 @@ def filter_args(valid_args, kwargs): Returns Dictionary with only valid_args keys if they exists """ - return dict((k, kwargs[k]) - for k in valid_args if k in kwargs) - + return dict((k, kwargs[k]) for k in valid_args if k in kwargs) diff --git a/hyperion/utils/multithreading.py b/hyperion/utils/multithreading.py index 1386ae73..4a3ab5cc 100644 --- a/hyperion/utils/multithreading.py +++ b/hyperion/utils/multithreading.py @@ -5,29 +5,31 @@ import threading + class ThreadSafeIter: """ Takes an iterator/generator and makes it thread-safe by serializing call to the `next` method of given iterator/generator. """ + def __init__(self, it): self.it = it self.lock = threading.Lock() - + def __iter__(self): return self def __next__(self): with self.lock: return self.it.__next__() - + def next(self): with self.lock: return self.it.next() def threadsafe_generator(f): - """ + """ A decorator that takes a generator function and makes it thread-safe. """ @@ -35,4 +37,3 @@ def generator(*args, **kwargs): return ThreadSafeIter(f(*args, **kwargs)) return generator - diff --git a/hyperion/utils/plotting.py b/hyperion/utils/plotting.py index 76b69184..7b87dbee 100644 --- a/hyperion/utils/plotting.py +++ b/hyperion/utils/plotting.py @@ -7,6 +7,7 @@ import scipy.linalg as la import matplotlib + # matplotlib.use('Agg') import matplotlib.pyplot as plt import scipy.stats as stats @@ -16,122 +17,128 @@ def plot_gaussian_1D(mu, C, num_sigmas=3, num_pts=100, weight=1, **kwargs): - """ Plots a 1D Gaussian. - + """Plots a 1D Gaussian. + Args: mu: mean C: variance - num_sigmas: plots the Gaussian in the interval - (mu-num_sigmas*sigma,mu+num_sigmas*sigma), + num_sigmas: plots the Gaussian in the interval + (mu-num_sigmas*sigma,mu+num_sigmas*sigma), where sigma is the standard deviation. num_pts: number of points to plot in the interval. 
kwargs: extra arguments for matplotlib """ sigma = np.sqrt(C) - delta = num_sigmas*sigma - x = np.linspace(mu-delta, mu+delta, num_pts) - plt.plot(x, weight*stats.norm.pdf(x, mu, sigma), **kwargs) + delta = num_sigmas * sigma + x = np.linspace(mu - delta, mu + delta, num_pts) + plt.plot(x, weight * stats.norm.pdf(x, mu, sigma), **kwargs) + - def plot_gaussian_3D(mu, C, num_sigmas=3, num_pts=100, ax=None, **kwargs): - """ Plots a 2D Gaussian in a 3D space - + """Plots a 2D Gaussian in a 3D space + Args: mu: mean C: covariance - num_sigmas: plots the Gaussian in the interval - (mu-num_sigmas*sigma,mu+num_sigmas*sigma), + num_sigmas: plots the Gaussian in the interval + (mu-num_sigmas*sigma,mu+num_sigmas*sigma), where sigma is the standard deviation. num_pts: number of points to plot in the interval. ax: image axes where to plot it kwargs: extra arguments for matplotlib """ - assert(mu.shape[0] == 2) - assert(C.shape[0] == 2 and C.shape[1] == 2) + assert mu.shape[0] == 2 + assert C.shape[0] == 2 and C.shape[1] == 2 num_pts *= 1j invC, _, logC = invert_pdmat(C, return_logdet=True) dim = mu.shape[0] d, v = la.eigh(C) - delta = num_sigmas*np.sum(v*np.sqrt(d), axis=1) - low_lim = mu-delta - high_lim = mu+delta - X, Y = np.mgrid[low_lim[0]:high_lim[0]:num_pts, low_lim[1]:high_lim[1]:num_pts] - xy = np.vstack((X.ravel(), Y.ravel()))-mu[:, None] - z = np.exp(-0.5*dim*np.log(2*np.pi)-0.5*logC-0.5*np.sum(xy*invC(xy), axis=0)) + delta = num_sigmas * np.sum(v * np.sqrt(d), axis=1) + low_lim = mu - delta + high_lim = mu + delta + X, Y = np.mgrid[ + low_lim[0] : high_lim[0] : num_pts, low_lim[1] : high_lim[1] : num_pts + ] + xy = np.vstack((X.ravel(), Y.ravel())) - mu[:, None] + z = np.exp( + -0.5 * dim * np.log(2 * np.pi) + - 0.5 * logC + - 0.5 * np.sum(xy * invC(xy), axis=0) + ) Z = np.reshape(z, X.shape) if ax is None: fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') - + ax = fig.add_subplot(111, projection="3d") + ax.plot_surface(X, Y, Z, **kwargs) - + def plot_gaussian_ellipsoid_2D(mu, C, num_sigmas=1, num_pts=100, **kwargs): - """ Plots a 2D Gaussian in a 2D space - + """Plots a 2D Gaussian in a 2D space + Args: mu: mean C: covariance - num_sigmas: plots the Gaussian in the interval - (mu-num_sigmas*sigma,mu+num_sigmas*sigma), + num_sigmas: plots the Gaussian in the interval + (mu-num_sigmas*sigma,mu+num_sigmas*sigma), where sigma is the standard deviation. num_pts: number of points to plot in the interval. kwargs: extra arguments for matplotlib """ - assert(mu.shape[0] == 2) - assert(C.shape[0] == 2 and C.shape[1] == 2) + assert mu.shape[0] == 2 + assert C.shape[0] == 2 and C.shape[1] == 2 - t = np.linspace(0,2*np.pi,num_pts) + t = np.linspace(0, 2 * np.pi, num_pts) x = np.cos(t) y = np.sin(t) - xy = np.vstack((x,y)) - d, v = la.eigh(C) + xy = np.vstack((x, y)) + d, v = la.eigh(C) d *= num_sigmas - r = np.dot(v*d, xy)+mu[:, None] - plt.plot(r[0,:], r[1,:], **kwargs) + r = np.dot(v * d, xy) + mu[:, None] + plt.plot(r[0, :], r[1, :], **kwargs) + - def plot_gaussian_ellipsoid_3D(mu, C, num_sigmas=1, num_pts=100, ax=None, **kwargs): - """ Plots a 3D Gaussian in a 3D space - + """Plots a 3D Gaussian in a 3D space + Args: mu: mean C: covariance - num_sigmas: plots the Gaussian in the interval - (mu-num_sigmas*sigma,mu+num_sigmas*sigma), + num_sigmas: plots the Gaussian in the interval + (mu-num_sigmas*sigma,mu+num_sigmas*sigma), where sigma is the standard deviation. num_pts: number of points to plot in the interval. 
ax: image axes where to plot it kwargs: extra arguments for matplotlib """ - assert(mu.shape[0] == 3) - assert(C.shape[0] == 3 and C.shape[1] == 3) + assert mu.shape[0] == 3 + assert C.shape[0] == 3 and C.shape[1] == 3 num_pts *= 1j - u, v = np.mgrid[0:2*np.pi:num_pts, 0:np.pi:num_pts/2] - x = np.cos(u)*np.sin(v) - y = np.sin(u)*np.sin(v) + u, v = np.mgrid[0 : 2 * np.pi : num_pts, 0 : np.pi : num_pts / 2] + x = np.cos(u) * np.sin(v) + y = np.sin(u) * np.sin(v) z = np.cos(v) d, v = la.eigh(C) xyz = np.vstack((x.ravel(), y.ravel(), z.ravel())) - r = np.dot(v*d, xyz)+mu[:, None] - - X = np.reshape(r[0,:], u.shape) - Y = np.reshape(r[1,:], u.shape) - Z = np.reshape(r[2,:], u.shape) + r = np.dot(v * d, xyz) + mu[:, None] + + X = np.reshape(r[0, :], u.shape) + Y = np.reshape(r[1, :], u.shape) + Z = np.reshape(r[2, :], u.shape) if ax is None: fig = plt.figure() - ax = fig.add_subplot(111, projection='3d') + ax = fig.add_subplot(111, projection="3d") ax.plot_wireframe(X, Y, Z, **kwargs) # def test_plotting(): - + # mu=np.array([1, 2, 3]) # C=np.array([[2, 0.5, 0.2], [0.5, 1., 0.1], [0.2, 0.1, 0.8]]) # la.cholesky(C) @@ -161,5 +168,3 @@ def plot_gaussian_ellipsoid_3D(mu, C, num_sigmas=1, num_pts=100, ax=None, **kwar # plot_gaussian_ellipsoid_3D(mu, C, ax=ax) # plt.show() # plt.savefig('plot_gaussian_ellipsoid_3D.pdf') - - diff --git a/hyperion/utils/queues.py b/hyperion/utils/queues.py index f3775498..ad4298be 100644 --- a/hyperion/utils/queues.py +++ b/hyperion/utils/queues.py @@ -18,6 +18,7 @@ except ImportError: import Queue as queue + class SequenceQueue(object): """Base class to enqueue inputs. @@ -90,9 +91,7 @@ class OrderedQueue(SequenceQueue): scheduling: Sequential querying of datas if 'sequential', random otherwise. """ - def __init__(self, sequence, - use_multiprocessing=False, - scheduling='sequential'): + def __init__(self, sequence, use_multiprocessing=False, scheduling="sequential"): self.sequence = sequence self.use_multiprocessing = use_multiprocessing self.scheduling = scheduling @@ -127,14 +126,14 @@ def _run(self): """Function to submit request to the executor and queue the `Future` objects.""" sequence = list(range(len(self.sequence))) while True: - if self.scheduling is not 'sequential': + if self.scheduling is not "sequential": random.shuffle(sequence) for i in sequence: if self.stop_signal.is_set(): return self.queue.put( - self.executor.apply_async(get_index, - (self.sequence, i)), block=True) + self.executor.apply_async(get_index, (self.sequence, i)), block=True + ) def get(self): """Creates a generator to extract data from the queue. @@ -185,10 +184,9 @@ class GeneratorQueue(SequenceQueue): will be incremented by one for each workers. """ - def __init__(self, generator, - use_multiprocessing=False, - wait_time=0.05, - random_seed=None): + def __init__( + self, generator, use_multiprocessing=False, wait_time=0.05, random_seed=None + ): self.wait_time = wait_time self._generator = generator self._use_multiprocessing = use_multiprocessing diff --git a/hyperion/utils/rttm.py b/hyperion/utils/rttm.py index 1be82d39..2ff3a4b0 100644 --- a/hyperion/utils/rttm.py +++ b/hyperion/utils/rttm.py @@ -22,6 +22,7 @@ class RTTM(object): iter_idx: index of the current element for the iterator. unique_file_key: unique file names. 
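# Editor's sketch of how the RTTM class in this file is meant to be driven from
# a separate script; the segment values are invented, and only
# create_spkdiar_single_file() and save() from this file are used:
from hyperion.utils.rttm import RTTM

tbeg = [0.0, 3.2, 7.5]                            # segment start times in seconds
tdur = [3.2, 4.3, 2.0]                            # segment durations in seconds
spk_id = ["spk1", "spk2", "spk1"]
rttm = RTTM.create_spkdiar_single_file("rec0001", tbeg, tdur, spk_id)
rttm.save("exp/diar/rec0001.rttm")                # writes one SPEAKER line per segment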
""" + def __init__(self, segments, index_by_file=True): self.segments = segments self._index_by_file = index_by_file @@ -34,13 +35,23 @@ def __init__(self, segments, index_by_file=True): self.unique_file_id = self.segments.file_id.unique() self.iter_idx = 0 - @classmethod - def create(cls, segment_type, file_id, chnl=None, tbeg=None, tdur=None, - ortho=None, stype=None, name=None, conf=None, slat=None, - index_by_file=True): + def create( + cls, + segment_type, + file_id, + chnl=None, + tbeg=None, + tdur=None, + ortho=None, + stype=None, + name=None, + conf=None, + slat=None, + index_by_file=True, + ): num_segments = len(segment_type) - nans = ['' for i in range(num_segments)] + nans = ["" for i in range(num_segments)] if chnl is None: chnl = [1 for i in range(num_segments)] if tbeg is None: @@ -58,97 +69,146 @@ def create(cls, segment_type, file_id, chnl=None, tbeg=None, tdur=None, if slat is None: slat = nans - df = pd.DataFrame({'segment_type': segment_type, - 'file_id': file_id, - 'chnl': chnl, - 'tbeg': tbeg, - 'tdur': tdur, - 'ortho': ortho, - 'stype': stype, - 'name': name, - 'conf': conf, - 'slat': slat}) - - return cls(df, index_by_file) - + df = pd.DataFrame( + { + "segment_type": segment_type, + "file_id": file_id, + "chnl": chnl, + "tbeg": tbeg, + "tdur": tdur, + "ortho": ortho, + "stype": stype, + "name": name, + "conf": conf, + "slat": slat, + } + ) + return cls(df, index_by_file) @classmethod - def create_spkdiar(cls, file_id, tbeg, tdur, spk_id, conf=None, chnl=None, - index_by_file=True, prepend_file_id=False): - segment_type = ['SPEAKER'] * len(file_id) + def create_spkdiar( + cls, + file_id, + tbeg, + tdur, + spk_id, + conf=None, + chnl=None, + index_by_file=True, + prepend_file_id=False, + ): + segment_type = ["SPEAKER"] * len(file_id) spk_id = cls._make_spk_ids(spk_id, file_id, prepend_file_id) - return cls.create(segment_type, file_id, chnl, tbeg, tdur, - name=spk_id, conf=conf, index_by_file=index_by_file) - + return cls.create( + segment_type, + file_id, + chnl, + tbeg, + tdur, + name=spk_id, + conf=conf, + index_by_file=index_by_file, + ) @classmethod - def create_spkdiar_single_file(cls, file_id, tbeg, tdur, spk_id, - conf=None, chnl=None, - index_by_file=True, prepend_file_id=False): + def create_spkdiar_single_file( + cls, + file_id, + tbeg, + tdur, + spk_id, + conf=None, + chnl=None, + index_by_file=True, + prepend_file_id=False, + ): assert len(tbeg) == len(spk_id) assert len(tbeg) == len(tdur) - segment_type = ['SPEAKER'] * len(tbeg) + segment_type = ["SPEAKER"] * len(tbeg) file_id = [file_id] * len(tbeg) spk_id = cls._make_spk_ids(spk_id, file_id, prepend_file_id) - return cls.create(segment_type, file_id, chnl, tbeg, tdur, - name=spk_id, conf=conf, index_by_file=index_by_file) - + return cls.create( + segment_type, + file_id, + chnl, + tbeg, + tdur, + name=spk_id, + conf=conf, + index_by_file=index_by_file, + ) @classmethod - def create_spkdiar_from_segments(cls, segments, spk_id, conf=None, chnl=None, - index_by_file=True, prepend_file_id=False): + def create_spkdiar_from_segments( + cls, + segments, + spk_id, + conf=None, + chnl=None, + index_by_file=True, + prepend_file_id=False, + ): assert len(segments) == len(spk_id) file_id = segments.file_id tbeg = segments.tbeg tdur = segments.tend - segments.tbeg - segment_type = ['SPEAKER'] * len(file_id) + segment_type = ["SPEAKER"] * len(file_id) spk_id = cls._make_spk_ids(spk_id, file_id, prepend_file_id) - return cls.create(segment_type, file_id, chnl, tbeg, tdur, - name=spk_id, conf=conf, - 
index_by_file=index_by_file) - + return cls.create( + segment_type, + file_id, + chnl, + tbeg, + tdur, + name=spk_id, + conf=conf, + index_by_file=index_by_file, + ) @classmethod - def create_spkdiar_from_ext_segments(cls, ext_segments, chnl=None, index_by_file=True, prepend_file_id=False): + def create_spkdiar_from_ext_segments( + cls, ext_segments, chnl=None, index_by_file=True, prepend_file_id=False + ): file_id = ext_segments.file_id tbeg = ext_segments.tbeg tdur = ext_segments.tend - ext_segments.tbeg - segment_type = ['SPEAKER'] * len(file_id) + segment_type = ["SPEAKER"] * len(file_id) name = ext_segments.segment_names conf = ext_segments.segment_score if prepend_file_id: name = cls._prepend_file_id(name, file_id) - - return cls.create(segment_type, file_id, chnl, tbeg, tdur, - name=name, conf=conf, - index_by_file=index_by_file) - + + return cls.create( + segment_type, + file_id, + chnl, + tbeg, + tdur, + name=name, + conf=conf, + index_by_file=index_by_file, + ) @staticmethod def _make_spk_ids(spk_ids, file_id, prepend_file_id): if prepend_file_id: - return [ f + '_' + str(s) for f,s in zip(file_id,spk_ids)] - return spk_ids #[str(s) for f,s in zip(file_id,spk_ids)] - + return [f + "_" + str(s) for f, s in zip(file_id, spk_ids)] + return spk_ids # [str(s) for f,s in zip(file_id,spk_ids)] @staticmethod def _prepend_file_id(spk_ids, file_id): - return [ f + '_' + str(s) for f,s in zip(file_id,spk_ids)] + return [f + "_" + str(s) for f, s in zip(file_id, spk_ids)] - def validate(self): - """Validates the attributes of the RTTM object. - """ + """Validates the attributes of the RTTM object.""" if not self.tbeg_is_sorted(): self.sort() - @property def index_by_file(self): return self._index_by_file - @index_by_file.setter def index_by_file(self, value): self._index_by_file = value @@ -156,61 +216,57 @@ def index_by_file(self, value): self.segments.index = self.segments.file_key else: self.segments.index = self.segments.segment - - + @property def file_id(self): - return np.asarray(self.segments['file_id']) + return np.asarray(self.segments["file_id"]) - @property def tbeg(self): - return np.asarray(self.segments['tbeg']) - + return np.asarray(self.segments["tbeg"]) - @property def tdur(self): - return np.asarray(self.segments['tdur']) - - + return np.asarray(self.segments["tdur"]) + @property def name(self): - return np.asarray(self.segments['name']) - + return np.asarray(self.segments["name"]) def copy(self): """Makes a copy of the object.""" return deepcopy(self) - @property def num_files(self): return len(self.unique_file_id) - @property def total_num_spks(self): - return len(self.segments[self.segments['segment_type']=='SPEAKER'].name.unique()) - + return len( + self.segments[self.segments["segment_type"] == "SPEAKER"].name.unique() + ) @property def num_spks_per_file(self): - return {file_id: len(self.segments[(self.segments['segment_type']=='SPEAKER') & - (self.segments['file_id']==file_id)].name.unique()) - for file_id in self.unique_file_id} - + return { + file_id: len( + self.segments[ + (self.segments["segment_type"] == "SPEAKER") + & (self.segments["file_id"] == file_id) + ].name.unique() + ) + for file_id in self.unique_file_id + } @property def avg_num_spks_per_file(self): return np.mean([v for k, v in self.num_spks_per_file.items()]) - def __iter__(self): - self.iter_idx=0 + self.iter_idx = 0 return self - def __next__(self): if self.index_by_file: if self.iter_idx < len(self.unique_file_id): @@ -226,17 +282,14 @@ def __next__(self): self.iter_idx += 1 return r - def 
__len__(self): """Returns the number of segments in the list.""" return len(self.segments) - - + def __contains__(self, key): - """ Returns True if the segments contains the key""" + """Returns True if the segments contains the key""" return key in self.segments.segment_id - def __getitem__(self, key): """It allows to acces the de segments by file_id or segment like in a ditionary, e.g.: @@ -246,7 +299,7 @@ def __getitem__(self, key): Args: key: Segment or file key Returns: - if index_by_file is True if returns segments of a given file_id + if index_by_file is True if returns segments of a given file_id in SegmentsList format, else it returns DataFrame """ if self.index_by_file: @@ -255,23 +308,30 @@ def __getitem__(self, key): else: return self.segments.iloc[key] - - def save(self, file_path, sep=' '): + def save(self, file_path, sep=" "): """Saves segments to text file. Args: file_path: File to write the list. sep: Separator between the fields """ - self.segments[['segment_type', 'file_id', 'chnl', - 'tbeg','tdur','ortho', 'stype', - 'name', 'conf', 'slat']].to_csv( - file_path, sep=sep, float_format='%.3f', - index=False, header=False) + self.segments[ + [ + "segment_type", + "file_id", + "chnl", + "tbeg", + "tdur", + "ortho", + "stype", + "name", + "conf", + "slat", + ] + ].to_csv(file_path, sep=sep, float_format="%.3f", index=False, header=False) - @classmethod - def load(cls, file_path, sep=' ', index_by_file=True): + def load(cls, file_path, sep=" ", index_by_file=True): """Loads script list from text file. Args: @@ -281,27 +341,36 @@ def load(cls, file_path, sep=' ', index_by_file=True): Returns: SegmentList object. """ - df = pd.read_csv(file_path, sep=sep, header=None, - names=['segment_type','file_id','chnl','tbeg','tdur', - 'ortho','stype','name','conf','slat']) + df = pd.read_csv( + file_path, + sep=sep, + header=None, + names=[ + "segment_type", + "file_id", + "chnl", + "tbeg", + "tdur", + "ortho", + "stype", + "name", + "conf", + "slat", + ], + ) return cls(df, index_by_file=index_by_file) - - def filter(self, filter_key, keep=True): - if not keep : - filter_key = np.setdiff1d( - np.asarray(self.segments.index), filter_key) + if not keep: + filter_key = np.setdiff1d(np.asarray(self.segments.index), filter_key) df = self.segments.loc[filter_key] return RTTM(df, index_by_file=self.index_by_file) - def split(self, idx, num_parts): key, _ = split_list(self.index, idx, num_parts) df = self.segments.loc[key] return RTTM(df, index_by_file=self.index_by_file) - @classmethod def merge(cls, rttm_list, index_by_file=True): dfs = [] @@ -310,45 +379,47 @@ def merge(cls, rttm_list, index_by_file=True): df = pd.concat(dfs) return cls(df, index_by_file=index_by_file) - def merge_adjacent_segments(self, t_margin=0): segm = self.segments segm_1 = self.segments.shift(1) delta = segm.tbeg - segm_1.tbeg - segm_1.tdur - index = ((segm.file_id == segm_1.file_id) & - (segm.segment_type == segm_1.segment_type) & - (segm.name == segm_1.name) & (delta <= t_margin)) - - for i in range(len(self.segments)-1, 0, -1): + index = ( + (segm.file_id == segm_1.file_id) + & (segm.segment_type == segm_1.segment_type) + & (segm.name == segm_1.name) + & (delta <= t_margin) + ) + + for i in range(len(self.segments) - 1, 0, -1): if index.iloc[i]: - tbeg = segm.iloc[i-1].tbeg + tbeg = segm.iloc[i - 1].tbeg tend = segm.iloc[i].tbeg + segm.iloc[i].tdur - self.segments.iloc[i-1, self.segments.columns.get_loc('tdur')] = tend - tbeg - self.segments.iloc[i, self.segments.columns.get_loc('segment_type')] = 'DROP' - - 
self.segments = self.segments[self.segments.segment_type != 'DROP'] + self.segments.iloc[i - 1, self.segments.columns.get_loc("tdur")] = ( + tend - tbeg + ) + self.segments.iloc[ + i, self.segments.columns.get_loc("segment_type") + ] = "DROP" + self.segments = self.segments[self.segments.segment_type != "DROP"] def __eq__(self, other): """Equal operator""" eq = self.segments.equals(other.segments) eq = eq and self.index_by_file == other.index_by_file - + return eq - def __ne__(self, other): """Non-equal operator""" return not self.__eq__(other) - def __cmp__(self, other): """Comparison operator""" if self.__eq__(other): return 0 return 1 - # def get_segment_names_slow(self, segment_list, sep='@', segment_type='SPEAKER'): # num_segm = len(segment_list) # names = [] @@ -374,7 +445,6 @@ def __cmp__(self, other): # return np.asarray(names), num_names - # def get_segment_names(self, segment_list, sep='@', segment_type='SPEAKER'): # num_segm = len(segment_list) # names = [] @@ -405,29 +475,37 @@ def __cmp__(self, other): # return np.asarray(names), num_names - - def get_segment_names_from_timestamps(self, file_id, timestamps, segment_type='SPEAKER', min_seg_dur=0.1): + def get_segment_names_from_timestamps( + self, file_id, timestamps, segment_type="SPEAKER", min_seg_dur=0.1 + ): num_segm = len(timestamps) names = [] num_names = np.zeros((num_segm,), dtype=int) - segments = self.segments[((self.segments['segment_type'] == segment_type) & (self.segments['file_id'] == file_id))] - tbegs = segments['tbeg'] - tends = segments['tbeg'] + segments['tdur'] + segments = self.segments[ + ( + (self.segments["segment_type"] == segment_type) + & (self.segments["file_id"] == file_id) + ) + ] + tbegs = segments["tbeg"] + tends = segments["tbeg"] + segments["tdur"] names = [] index = [] durs = [] for i in range(num_segm): tbeg_i = timestamps[i][0] tend_i = timestamps[i][1] - segments_i = segments[((tbegs <= tbeg_i) & (tends > tbeg_i)) | - ((tbegs < tend_i) & (tends >=tend_i))] + segments_i = segments[ + ((tbegs <= tbeg_i) & (tends > tbeg_i)) + | ((tbegs < tend_i) & (tends >= tend_i)) + ] # print('####') # print(tbeg_i, tend_i) # print(segments_i) if len(segments_i) == 0: continue - tbegs_i = np.asarray(segments_i['tbeg']) - tends_i = np.asarray(segments_i['tbeg'] + segments_i['tdur']) + tbegs_i = np.asarray(segments_i["tbeg"]) + tends_i = np.asarray(segments_i["tbeg"] + segments_i["tdur"]) durs_i = np.minimum(tends_i, tend_i) - np.maximum(tbeg_i, tbegs_i) # print(tbegs_i) # print(tends_i) @@ -437,7 +515,7 @@ def get_segment_names_from_timestamps(self, file_id, timestamps, segment_type='S durs_i = durs_i[dur_mask] for j in range(len(segments_i)): - names.append(segments_i.iloc[j]['name']) + names.append(segments_i.iloc[j]["name"]) durs.append(durs_i[j]) index.append(i) # print('----') @@ -448,19 +526,17 @@ def get_segment_names_from_timestamps(self, file_id, timestamps, segment_type='S index = np.asarray(index, dtype=np.int) return index, names, durs - - def get_files_with_names_diff_to_file(self, file_id, segment_type='SPEAKER'): - segments = self.segments[self.segments['segment_type'] == segment_type] - names = segments[segments['file_id'] == file_id].name.unique() + def get_files_with_names_diff_to_file(self, file_id, segment_type="SPEAKER"): + segments = self.segments[self.segments["segment_type"] == segment_type] + names = segments[segments["file_id"] == file_id].name.unique() sel_files = segments[~segments.name.isin(names)].file_id.unique() return sel_files - - - def prepend_file_id_to_name(self, 
segment_type='SPEAKER'): - idx = self.segments['segment_type'] == segment_type - self.segments.loc[idx, 'name'] = self.segments.loc[ - idx, ['file_id', 'name']].apply(lambda x: '_'.join(x), axis=1) + def prepend_file_id_to_name(self, segment_type="SPEAKER"): + idx = self.segments["segment_type"] == segment_type + self.segments.loc[idx, "name"] = self.segments.loc[ + idx, ["file_id", "name"] + ].apply(lambda x: "_".join(x), axis=1) # def eliminate_overlaps(self): # segm = self.segments @@ -484,26 +560,29 @@ def prepend_file_id_to_name(self, segment_type='SPEAKER'): # # logging.debug(self.segments.loc[index_1]) # self.segments.loc[index, 'tdur'] = tend[index] - tavg[index] - def get_segments_from_file(self, file_id): if self.index_by_file: segments = self.segments.loc[[file_id]] else: - segments = self.segments[self.segments['file_id'] == file_id] + segments = self.segments[self.segments["file_id"] == file_id] return segments - def get_uniq_names_for_file(self, file_id=None): segments = self.get_segments_from_file(file_id) - u_names = np.unique(segments['name']) + u_names = np.unique(segments["name"]) return u_names - def get_bin_frame_mask_for_spk( - self, file_id, name, - frame_length=0.025, frame_shift=0.01, - snip_edges=False, signal_length=None, max_frames=None): + self, + file_id, + name, + frame_length=0.025, + frame_shift=0.01, + snip_edges=False, + signal_length=None, + max_frames=None, + ): """Returns binary mask of a given speaker to select feature frames Args: @@ -519,18 +598,19 @@ def get_bin_frame_mask_for_spk( """ segments = self.get_segments_from_file(file_id) - segments = segments[(segments['segment_type']=='SPEAKER') & - (segments['name']==name)] + segments = segments[ + (segments["segment_type"] == "SPEAKER") & (segments["name"] == name) + ] tbeg = segments.tbeg tend = segments.tbeg + segments.tdur - ts = np.asarray([[tbeg[i],tend[i]] for i in len(tbeg)]) - return vad_timestamps_to_bin(ts, frame_length, frame_shift, snip_edges, - signal_length, max_frames) - + ts = np.asarray([[tbeg[i], tend[i]] for i in len(tbeg)]) + return vad_timestamps_to_bin( + ts, frame_length, frame_shift, snip_edges, signal_length, max_frames + ) def get_bin_sample_mask_for_spk( - self, file_id, name, fs, - signal_length=None, max_samples=None): + self, file_id, name, fs, signal_length=None, max_samples=None + ): """Returns binary mask of a given speaker to select waveform samples Args: @@ -543,8 +623,9 @@ def get_bin_sample_mask_for_spk( Binary mask np.array """ segments = self.get_segments_from_file(file_id) - segments = segments[(segments['segment_type']=='SPEAKER') & - (segments['name']==name)] + segments = segments[ + (segments["segment_type"] == "SPEAKER") & (segments["name"] == name) + ] tbeg = (segments.tbeg * fs).astype(dtype=np.int) tend = ((segments.tbeg + segments.tdur) * fs + 1).astype(dtype=np.int) if max_samples is None: @@ -552,17 +633,16 @@ def get_bin_sample_mask_for_spk( max_samples = tend[-1] else: max_samples = int(signal_length * fs) - - tend[tend>max_samples] = max_samples + + tend[tend > max_samples] = max_samples vad = np.zeros((max_samples,), dtype=np.bool) - for i,j in zip(tbeg, tend): + for i, j in zip(tbeg, tend): if j > i: vad[i:j] = True return vad - # def to_matrix(self, file_id, frame_shift=0.01, frame_length=0.025, snip_edges=False): # if self.index_by_file: # segments = self.segments[file_id] @@ -577,49 +657,51 @@ def get_bin_sample_mask_for_spk( # for i in range(len(u_names)): # M[tbeg[i]:tend[i], name_ids[i]] = 1 - def compute_stats(self, nbins_dur=None): # 
segment durations - max_dur = self.segments['tdur'].max() - min_dur = self.segments['tdur'].min() - mean_dur = self.segments['tdur'].mean() - std_dur = self.segments['tdur'].std() - median_dur = self.segments['tdur'].median() - mode_dur = self.segments['tdur'].mode() - dur_info = pd.Series([mean_dur, std_dur, median_dur, mode_dur, min_dur, max_dur], - index=['mean','std','median','model','min','max']) + max_dur = self.segments["tdur"].max() + min_dur = self.segments["tdur"].min() + mean_dur = self.segments["tdur"].mean() + std_dur = self.segments["tdur"].std() + median_dur = self.segments["tdur"].median() + mode_dur = self.segments["tdur"].mode() + dur_info = pd.Series( + [mean_dur, std_dur, median_dur, mode_dur, min_dur, max_dur], + index=["mean", "std", "median", "model", "min", "max"], + ) if nbins_dur is None: - nbins_dur = np.max(5, np.min(100, len(self.segments)/10)) - hist_dur = np.histogram(self.segments['tdur'], nbins_dur) + nbins_dur = np.max(5, np.min(100, len(self.segments) / 10)) + hist_dur = np.histogram(self.segments["tdur"], nbins_dur) - # number of speakers - total_spks = len(self.segments['name'].unique()) + total_spks = len(self.segments["name"].unique()) # overlaps # TODO return dur_info, hist_dur, total_spks - - + def to_segment_list(self): - segments = self.segments[['file_id','tbeg']].copy() - segments['tend'] = self.segments['tbeg'] + self.segments['tdur'] - segments['segment_id'] = ['%s-%07d-%07d' % (file_id, tbeg, tdur) - for file_id, tbeg, tdur in zip( - segments['file_id'], - segments['tbeg'], - segments['tend'])] + segments = self.segments[["file_id", "tbeg"]].copy() + segments["tend"] = self.segments["tbeg"] + self.segments["tdur"] + segments["segment_id"] = [ + "%s-%07d-%07d" % (file_id, tbeg, tdur) + for file_id, tbeg, tdur in zip( + segments["file_id"], segments["tbeg"], segments["tend"] + ) + ] return SegmentList(segments) - def sort(self): - self.segments.sort_values(by=['file_id', 'tbeg'], inplace=True) - + self.segments.sort_values(by=["file_id", "tbeg"], inplace=True) def tbeg_is_sorted(self): - return np.all(np.logical_or(self.tbeg[1:]-self.tbeg[:-1]>=0, - self.file_id[1:] != self.file_id[:-1])) + return np.all( + np.logical_or( + self.tbeg[1:] - self.tbeg[:-1] >= 0, + self.file_id[1:] != self.file_id[:-1], + ) + ) diff --git a/hyperion/utils/scp_list.py b/hyperion/utils/scp_list.py index 95917228..8109d905 100644 --- a/hyperion/utils/scp_list.py +++ b/hyperion/utils/scp_list.py @@ -33,10 +33,8 @@ def __init__(self, key, file_path, offset=None, range_spec=None): self.key_to_index = None self.validate() - def validate(self): - """Validates the attributes of the SCPList object. - """ + """Validates the attributes of the SCPList object.""" self.key = list2ndarray(self.key) self.file_path = list2ndarray(self.file_path, dtype=np.object) assert len(self.key) == len(self.file_path) @@ -53,30 +51,23 @@ def validate(self): assert len(self.key) == self.range_spec.shape[0] assert self.range_spec.shape[1] == 2 - - def copy(self): """Makes a copy of the object.""" return deepcopy(self) - def __len__(self): """Returns the number of elements in the list.""" return len(self.key) - - + def len(self): """Returns the number of elements in the list.""" return len(self.key) - def _create_dict(self): - """Creates dictionary that returns the position of - a segment in the list. + """Creates dictionary that returns the position of + a segment in the list. 
""" - self.key_to_index = OrderedDict((k,i) for i, k in enumerate(self.key)) - - + self.key_to_index = OrderedDict((k, i) for i, k in enumerate(self.key)) def get_index(self, key): """Returns the position of key in the list.""" @@ -84,24 +75,21 @@ def get_index(self, key): self._create_dict() return self.key_to_index[key] - - def __contains__(self, key): - """ Returns True if the list contains the key""" + """Returns True if the list contains the key""" if self.key_to_index is None: self._create_dict() return key in self.key_to_index - - + def __getitem__(self, key): - """It allows to acces the data in the list by key or index like in + """It allows to acces the data in the list by key or index like in a ditionary, e.g.: If input is a string key: scp = SCPList(keys, file_paths, offsets, ranges) file_path, offset, range = scp['data1'] If input is an index: key, file_path, offset, range = scp[0] - + Args: key: String key or integer index. Returns: @@ -123,16 +111,13 @@ def __getitem__(self, key): else: return self.file_path[index], offset, range_spec - def add_prefix_to_filepath(self, prefix): """Adds a prefix to the file path""" self.file_path = np.array([prefix + p for p in self.file_path]) - - def sort(self): """Sorts the list by key""" - self.key, idx = sort(self.key, return_index=True) + self.key, idx = sort(self.key, return_index=True) self.file_path = self.file_path[idx] if self.offset is not None: self.offset = self.offset[idx] @@ -140,9 +125,7 @@ def sort(self): self.range_spec = self.range_spec[idx] self.key_to_index = None - - - def save(self, file_path, sep=' ', offset_sep=':'): + def save(self, file_path, sep=" ", offset_sep=":"): """Saves script list to text file. Args: @@ -151,74 +134,79 @@ def save(self, file_path, sep=' ', offset_sep=':'): offset_sep: Separator between file_path and offset. """ if self.range_spec is None: - range_spec = ['' for k in self.key] + range_spec = ["" for k in self.key] else: range_spec = [] for r in self.range_spec: if r[0] == 0 and r[1] == 0: - range_spec.append('') + range_spec.append("") elif r[1] == 0: - range_spec.append('[%d:]' % r[0]) + range_spec.append("[%d:]" % r[0]) else: - range_spec.append('[%d:%d]' % (r[0], r[0]+r[1]-1)) - - - with open(file_path, 'w') as f: + range_spec.append("[%d:%d]" % (r[0], r[0] + r[1] - 1)) + + with open(file_path, "w") as f: if self.offset is None: for k, p, r in zip(self.key, self.file_path, range_spec): - f.write('%s%s%s%s\n' % (k, sep, p, r)) + f.write("%s%s%s%s\n" % (k, sep, p, r)) else: - for k, p, o, r in zip(self.key, self.file_path, self.offset, range_spec): - f.write('%s%s%s%s%d%s\n' % (k, sep, p, offset_sep, o, r)) - + for k, p, o, r in zip( + self.key, self.file_path, self.offset, range_spec + ): + f.write("%s%s%s%s%d%s\n" % (k, sep, p, offset_sep, o, r)) - @staticmethod def parse_script(script, offset_sep): """Parses the parts of the second field of the scp text file. - + Args: script: Second column of scp file. offset_sep: Separtor between file_path and offset. - + Returns: file_path, offset and range_spec. 
""" - file_range = [f.split(sep='[', maxsplit=1) for f in script] + file_range = [f.split(sep="[", maxsplit=1) for f in script] offset = None range_spec = None - + file_offset = [f[0].split(sep=offset_sep, maxsplit=1) for f in file_range] file_path = [f[0] for f in file_offset] - + if len(file_offset[0]) == 2: - offset = [int(f[1]) if len(f)==2 else -1 for f in file_offset] + offset = [int(f[1]) if len(f) == 2 else -1 for f in file_offset] if -1 in offset: - raise ValueError('Missing data position for %s' % f[0]) + raise ValueError("Missing data position for %s" % f[0]) do_range = False for f in file_range: - if len(f)==2: + if len(f) == 2: do_range = True break if do_range: - range_spec1 = [f[1].rstrip(']').split(sep=':', maxsplit=1) - if len(f)==2 else None for f in file_range] - range_spec21 = [int(f[0]) if f is not None and f[0].isdecimal() - else 0 for f in range_spec1] - range_spec22 = [int(f[1]) if f is not None and f[1].isdecimal() - else None for f in range_spec1] - range_spec = [[a, b-a+1] if b is not None else [a, 0] - for a,b in zip(range_spec21, range_spec22)] + range_spec1 = [ + f[1].rstrip("]").split(sep=":", maxsplit=1) if len(f) == 2 else None + for f in file_range + ] + range_spec21 = [ + int(f[0]) if f is not None and f[0].isdecimal() else 0 + for f in range_spec1 + ] + range_spec22 = [ + int(f[1]) if f is not None and f[1].isdecimal() else None + for f in range_spec1 + ] + range_spec = [ + [a, b - a + 1] if b is not None else [a, 0] + for a, b in zip(range_spec21, range_spec22) + ] range_spec = np.array(range_spec, dtype=np.int64) - + return file_path, offset, range_spec - - @classmethod - def load(cls, file_path, sep=' ', offset_sep=':', is_wav=False): + def load(cls, file_path, sep=" ", offset_sep=":", is_wav=False): """Loads script list from text file. Args: @@ -229,7 +217,7 @@ def load(cls, file_path, sep=' ', offset_sep=':', is_wav=False): Returns: SCPList object. """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: fields = [line.rstrip().split(sep=sep, maxsplit=1) for line in f] key = [f[0] for f in fields] @@ -243,15 +231,13 @@ def load(cls, file_path, sep=' ', offset_sep=':', is_wav=False): file_path, offset, range_spec = SCPList.parse_script(script, offset_sep) return cls(key, file_path, offset, range_spec) - - def split(self, idx, num_parts, group_by_key=True): - """ Splits SCPList into num_parts and return part idx. - + """Splits SCPList into num_parts and return part idx. + Args: idx: Part to return from 1 to num_parts. num_parts: Number of parts to split the list. - group_by_key: If True, all the lines with the same key + group_by_key: If True, all the lines with the same key go to the same part. Returns: @@ -269,25 +255,23 @@ def split(self, idx, num_parts, group_by_key=True): offset = self.offset[idx1] if self.range_spec is not None: range_spec = self.range_spec[idx1] - - return SCPList(key, file_path, offset, range_spec) + return SCPList(key, file_path, offset, range_spec) - @classmethod def merge(cls, scp_lists): """Merges several SCPList. - + Args: scp_lists: List of SCPLists - + Returns: SCPList object concatenation the scp_lists. 
""" key_list = [item.key for item in scp_lists] file_list = [item.file_path for item in scp_lists] offset_list = [item.offset for item in scp_lists] - range_list = [item.range_spec for item in scp_lists] + range_list = [item.range_spec for item in scp_lists] key = np.concatenate(tuple(key_list)) file_path = np.concatenate(tuple(file_list)) @@ -304,11 +288,9 @@ def merge(cls, scp_lists): return cls(key, file_path, offset, range_spec) - - def filter(self, filter_key, keep=True): """Removes elements from SCPList ojbect by key - + Args: filter_key: List with the keys of the elements to keep or remove. keep: If True, we keep the elements in filter_key; @@ -325,21 +307,19 @@ def filter(self, filter_key, keep=True): f, _ = ismember(self.key, filter_key) key = self.key[f] file_path = self.file_path[f] - + offset = None range_spec = None if self.offset is not None: offset = self.offset[f] if self.range_spec is not None: range_spec = self.range_spec[f] - - return SCPList(key, file_path, offset, range_spec) + return SCPList(key, file_path, offset, range_spec) - def filter_paths(self, filter_key, keep=True): """Removes elements of SCPList by file_path - + Args: filter_key: List with the file_path of the elements to keep or remove. keep: If True, we keep the elements in filter_key; @@ -349,29 +329,27 @@ def filter_paths(self, filter_key, keep=True): SCPList object. """ - if not keep : + if not keep: filter_key = np.setdiff1d(self.file_path, filter_key) f, _ = ismember(filter_key, self.file_path) - assert(np.all(f)) + assert np.all(f) f, _ = ismember(self.file_path, filter_key) key = self.key[f] file_path = self.file_path[f] - + offset = None range_spec = None if self.offset is not None: offset = self.offset[f] if self.range_spec is not None: range_spec = self.range_spec[f] - - return SCPList(key, file_path, offset, range_spec) + return SCPList(key, file_path, offset, range_spec) - def filter_index(self, index, keep=True): """Removes elements of SCPList by index - + Args: filter_key: List with the index of the elements to keep or remove. keep: If True, we keep the elements in filter_key; @@ -381,9 +359,8 @@ def filter_index(self, index, keep=True): SCPList object. """ - if not keep : - index = np.setdiff1d(np.arange( - len(self.key), dtype=np.int64), index) + if not keep: + index = np.setdiff1d(np.arange(len(self.key), dtype=np.int64), index) key = self.key[index] file_path = self.file_path[index] @@ -393,11 +370,9 @@ def filter_index(self, index, keep=True): offset = self.offset[index] if self.range_spec is not None: range_spec = self.range_spec[index] - + return SCPList(key, file_path, offset, range_spec) - - - + def shuffle(self, seed=1024, rng=None): """Shuffles the elements of the list. 
@@ -412,7 +387,7 @@ def shuffle(self, seed=1024, rng=None): rng = np.random.RandomState(seed=seed) index = np.arange(len(self.key)) rng.shuffle(index) - + self.key = self.key[index] self.file_path = self.file_path[index] if self.offset is not None: @@ -423,8 +398,6 @@ def shuffle(self, seed=1024, rng=None): self.key_to_index = None return index - - def __eq__(self, other): """Equal operator""" if self.key.size == 0 and other.key.size == 0: @@ -434,35 +407,34 @@ def __eq__(self, other): eq = eq and (self.file_path.shape == other.file_path.shape) eq = eq and np.all(self.file_path == other.file_path) - if (self.offset is None and other.offset is not None or - self.offset is not None and other.offset is None): + if ( + self.offset is None + and other.offset is not None + or self.offset is not None + and other.offset is None + ): eq = False elif self.offset is not None and other.offset is not None: eq = eq and np.all(self.offset == other.offset) - if (self.range_spec is None and other.range_spec is not None or - self.range_spec is not None and other.range_spec is None): + if ( + self.range_spec is None + and other.range_spec is not None + or self.range_spec is not None + and other.range_spec is None + ): eq = False elif self.range_spec is not None and other.range_spec is not None: eq = eq and np.all(self.range_spec == other.range_spec) - - return eq + return eq - def __ne__(self, other): """Non-equal operator""" return not self.__eq__(other) - - def __cmp__(self, other): """Comparison operator""" if self.__eq__(other): return 0 return 1 - - - - - diff --git a/hyperion/utils/segment_list.py b/hyperion/utils/segment_list.py index a8e6407b..33b432bd 100644 --- a/hyperion/utils/segment_list.py +++ b/hyperion/utils/segment_list.py @@ -34,38 +34,29 @@ def __init__(self, segments, index_by_file=True): self.uniq_file_id = np.unique(self.segments.file_id) self.iter_idx = 0 - - @classmethod def create(cls, segment_id, file_id, tbeg, tend, index_by_file=True): - segments = pd.Dataframe({'segment_id': segment_id, - 'file_id': file_id, - 'tbeg': tbeg, - 'tend': tend}) + segments = pd.Dataframe( + {"segment_id": segment_id, "file_id": file_id, "tbeg": tbeg, "tend": tend} + ) return cls(segments, index_by_file) - - def validate(self): - """Validates the attributes of the SegmentList object. 
- """ + """Validates the attributes of the SegmentList object.""" # logging.debug(len(self.segments['tend']-self.segments['tbeg']>=0)) # logging.debug(len(self.segments['tbeg'][1:])) # logging.debug(len(self.segments['tbeg'][:-1])) # logging.debug(self.segments['tbeg'][1:]-self.segments['tbeg'][:-1]) # logging.debug(len(self.segments['tbeg'][1:]-self.segments['tbeg'][:-1]>=0)) # logging.debug(len(self.file_id[1:] != self.file_id[:-1])) - assert np.all(self.segments['tend']-self.segments['tbeg']>=0) + assert np.all(self.segments["tend"] - self.segments["tbeg"] >= 0) # assert np.all(np.logical_or(self.tbeg[1:]-self.tbeg[:-1]>=0, # self.file_id[1:] != self.file_id[:-1])) - - @property def index_by_file(self): return self._index_by_file - @index_by_file.setter def index_by_file(self, value): self._index_by_file = value @@ -73,49 +64,38 @@ def index_by_file(self, value): self.segments.index = self.segments.file_id else: self.segments.index = self.segments.segment_id - - - + @property def file_id(self): - return np.asarray(self.segments['file_id']) - + return np.asarray(self.segments["file_id"]) @property def segment_id(self): - return np.asarray(self.segments['segment_id']) - + return np.asarray(self.segments["segment_id"]) @property def tbeg(self): - return np.asarray(self.segments['tbeg']) + return np.asarray(self.segments["tbeg"]) - @property def tend(self): - return np.asarray(self.segments['tend']) - + return np.asarray(self.segments["tend"]) def copy(self): """Makes a copy of the object.""" return deepcopy(self) - def segments_ids_from_file(self, file_id): - """Returns segments_ids corresponding to a given file_id - """ + """Returns segments_ids corresponding to a given file_id""" if self.index_by_file: - return np.asarray(self.segments.loc[file_id]['segment_id']) - index = self.segments['file_id']==file_id - return np.asarray(self.segments.loc[index]['segment_id']) - - + return np.asarray(self.segments.loc[file_id]["segment_id"]) + index = self.segments["file_id"] == file_id + return np.asarray(self.segments.loc[index]["segment_id"]) + def __iter__(self): - self.iter_idx=0 + self.iter_idx = 0 return self - - def __next__(self): if self.index_by_file: if self.iter_idx < len(self.uniq_file_id): @@ -124,7 +104,7 @@ def __next__(self): raise StopIteration() else: if self.iter_idx < len(self.segments): - #r = self.__getitem__(self.segments['segment_id'].iloc[self.iter_idx]) + # r = self.__getitem__(self.segments['segment_id'].iloc[self.iter_idx]) r = self.segments.iloc[self.iter_idx] else: raise StopIteration() @@ -132,18 +112,14 @@ def __next__(self): self.iter_idx += 1 return r - - def __len__(self): """Returns the number of segments in the list.""" return len(self.segments) - - + def __contains__(self, key): - """ Returns True if the segments contains the key""" + """Returns True if the segments contains the key""" return key in self.segments.segment_id - def getitem_by_key(self, key): """It acceses the segments by file_id or segment_id like in a ditionary, e.g.: @@ -153,7 +129,7 @@ def getitem_by_key(self, key): Args: key: Segment or file key Returns: - if index_by_file is True if returns segments of a given file_id + if index_by_file is True if returns segments of a given file_id in SegmentsList format, else it returns DataFrame """ if self.index_by_file: @@ -162,8 +138,6 @@ def getitem_by_key(self, key): else: return self.segments.loc[key] - - def getitem_by_index(self, index): """It accesses the segments by index like in a ditionary, e.g.: @@ -173,20 +147,24 @@ def 
getitem_by_index(self, index): Args: key: Segment or file key Returns: - if index_by_file is True if returns segments of a given file_id + if index_by_file is True if returns segments of a given file_id in SegmentsList format, else it returns DataFrame """ if self.index_by_file: if index < len(self.uniq_file_id): return self.getitem_by_key(self.uniq_file_id[self.iter_idx]) else: - raise Exception('SegmentList error index>=num_files (%d,%d)' % (index,len(self.uniq_file_id))) + raise Exception( + "SegmentList error index>=num_files (%d,%d)" + % (index, len(self.uniq_file_id)) + ) else: if index < len(self.segments): return self.segments.iloc[index] else: - raise Exception('SegmentList error index>=num_segments (%d,%d)' % (index,len(self))) - + raise Exception( + "SegmentList error index>=num_segments (%d,%d)" % (index, len(self)) + ) def __getitem__(self, key): """It accesses the de segments by file_id or segment_id @@ -197,7 +175,7 @@ def __getitem__(self, key): Args: key: Segment or file key Returns: - if index_by_file is True if returns segments of a given file_id + if index_by_file is True if returns segments of a given file_id in SegmentsList format, else it returns DataFrame """ if isinstance(key, str): @@ -205,22 +183,19 @@ def __getitem__(self, key): else: return self.getitem_by_index(key) - - - def save(self, file_path, sep=' '): + def save(self, file_path, sep=" "): """Saves segments to text file. Args: file_path: File to write the list. sep: Separator between the fields """ - self.segments[['segment_id', 'file_id', 'tbeg', 'tend']].to_csv( - file_path, sep=sep, float_format='%.3f', index=False, header=False) + self.segments[["segment_id", "file_id", "tbeg", "tend"]].to_csv( + file_path, sep=sep, float_format="%.3f", index=False, header=False + ) - - @classmethod - def load(cls, file_path, sep=' ', index_by_file=True): + def load(cls, file_path, sep=" ", index_by_file=True): """Loads script list from text file. Args: @@ -230,21 +205,20 @@ def load(cls, file_path, sep=' ', index_by_file=True): Returns: SegmentList object. 
""" - df = pd.read_csv(file_path, sep=sep, header=None, - names=['segment_id','file_id','tbeg','tend']) + df = pd.read_csv( + file_path, + sep=sep, + header=None, + names=["segment_id", "file_id", "tbeg", "tend"], + ) return cls(df, index_by_file=index_by_file) - - def filter(self, filter_key, keep=True): - if not keep : - filter_key = np.setdiff1d( - np.asarray(self.segments.index), filter_key) + if not keep: + filter_key = np.setdiff1d(np.asarray(self.segments.index), filter_key) df = self.segments.loc[filter_key] return SegmentList(df, index_by_file=self.index_by_file) - - def split(self, idx, num_parts): if self.index_by_file: key, _ = split_list(self.uniq_file_id, idx, num_parts) @@ -253,8 +227,6 @@ def split(self, idx, num_parts): df = self.segments.loc[key] return SegmentList(df, index_by_file=self.index_by_file) - - @classmethod def merge(cls, segment_lists, index_by_file=True): dfs = [] @@ -263,64 +235,62 @@ def merge(cls, segment_lists, index_by_file=True): df = pd.concat(dfs) return cls(df, index_by_file=index_by_file) - - def to_bin_vad(self, key, frame_shift=10 , num_frames=None): + def to_bin_vad(self, key, frame_shift=10, num_frames=None): """Converts segments to binary VAD Args: key: Segment or file key frame_shift: frame_shift in milliseconds - num_frames: number of frames of file corresponding to key, + num_frames: number of frames of file corresponding to key, if None it takes the maximum tend for file Returns: if index_by_file is True if returns VAD joining all segments of one file else if returns VAD for one given segment """ - tbeg = np.round(np.array(self.segments.loc[key]['tbeg'], dtype=float, ndmin=1) - * 1000/frame_shift).astype(dtype=int) - tend = np.round(np.array(self.segments.loc[key]['tend'], dtype=float, ndmin=1) - * 1000/frame_shift).astype(dtype=int) + tbeg = np.round( + np.array(self.segments.loc[key]["tbeg"], dtype=float, ndmin=1) + * 1000 + / frame_shift + ).astype(dtype=int) + tend = np.round( + np.array(self.segments.loc[key]["tend"], dtype=float, ndmin=1) + * 1000 + / frame_shift + ).astype(dtype=int) if num_frames is None: if self.index_by_file: - num_frames=tend[-1] + num_frames = tend[-1] else: - file_id = self.segments.loc[key]['file_id'] - sel_idx = self.segments['file_id'] == file_id - num_frames = int(np.round(self.segments[sel_idx]['tend'].max() * 1000/self.frame_shift)) - - tend = np.minimum(num_frames-1, tend) + file_id = self.segments.loc[key]["file_id"] + sel_idx = self.segments["file_id"] == file_id + num_frames = int( + np.round( + self.segments[sel_idx]["tend"].max() * 1000 / self.frame_shift + ) + ) + + tend = np.minimum(num_frames - 1, tend) vad = np.zeros((num_frames,), dtype=int) for j in range(len(tbeg)): - vad[tbeg[j]:tend[j]+1] = 1 + vad[tbeg[j] : tend[j] + 1] = 1 return vad - - def __eq__(self, other): """Equal operator""" eq = self.segments.equals(other.segments) eq = eq and self.index_by_file == other.index_by_file - - return eq + return eq - def __ne__(self, other): """Non-equal operator""" return not self.__eq__(other) - - def __cmp__(self, other): """Comparison operator""" if self.__eq__(other): return 0 return 1 - - - - - diff --git a/hyperion/utils/sparse_trial_key.py b/hyperion/utils/sparse_trial_key.py index cf43075e..f18dee3b 100644 --- a/hyperion/utils/sparse_trial_key.py +++ b/hyperion/utils/sparse_trial_key.py @@ -13,9 +13,10 @@ from .trial_ndx import TrialNdx from .trial_key import TrialKey + class SparseTrialKey(TrialKey): - """ Contains the trial key for speaker recognition trials. 
+ """Contains the trial key for speaker recognition trials. Bosaris compatible Key. Attributes: @@ -30,49 +31,58 @@ class SparseTrialKey(TrialKey): seg_cond_name: String list with the names of the segment conditions. trial_cond_name: String list with the names of the trial conditions. """ - - def __init__(self, model_set=None, seg_set=None, tar=None, non=None, - model_cond=None, seg_cond = None, trial_cond=None, - model_cond_name=None, seg_cond_name=None, trial_cond_name=None): - - super().__init__( - model_set, seg_set, tar, non, - model_cond, seg_cond, trial_cond, - model_cond_name, seg_cond_name, trial_cond_name) + def __init__( + self, + model_set=None, + seg_set=None, + tar=None, + non=None, + model_cond=None, + seg_cond=None, + trial_cond=None, + model_cond_name=None, + seg_cond_name=None, + trial_cond_name=None, + ): + super().__init__( + model_set, + seg_set, + tar, + non, + model_cond, + seg_cond, + trial_cond, + model_cond_name, + seg_cond_name, + trial_cond_name, + ) def save_h5(self, file_path): raise NotImplementedError() - - def save_txt(self, file_path): """Saves object to txt file. Args: file_path: File to write the list. """ - with open(file_path, 'w') as f: + with open(file_path, "w") as f: self.tar.eliminate_zeros() self.non.eliminate_zeros() tar = self.tar.tocoo() for r, c in zip(tar.row, tar.col): - f.write('%s %s target\n' % - (self.model_set[r], self.seg_set[c])) + f.write("%s %s target\n" % (self.model_set[r], self.seg_set[c])) non = self.non.tocoo() for r, c in zip(non.row, non.col): - f.write('%s %s nontarget\n' % - (self.model_set[r], self.seg_set[c])) - - + f.write("%s %s nontarget\n" % (self.model_set[r], self.seg_set[c])) @classmethod def load_h5(cls, file_path): raise NotImplementedError() - @classmethod def load_txt(cls, file_path): """Loads object from txt file @@ -83,17 +93,19 @@ def load_txt(cls, file_path): Returns: TrialKey object. """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: fields = [line.split() for line in f] models = [i[0] for i in fields] segments = [i[1] for i in fields] - is_tar = [i[2] == 'target' for i in fields] + is_tar = [i[2] == "target" for i in fields] model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True) + models, return_index=True, return_inverse=True + ) seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True) - tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype='bool') - non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype='bool') + segments, return_index=True, return_inverse=True + ) + tar = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") + non = sparse.lil_matrix((len(model_set), len(seg_set)), dtype="bool") for item in zip(model_idx, seg_idx, is_tar): if item[2]: tar[item[0], item[1]] = True @@ -101,13 +113,10 @@ def load_txt(cls, file_path): non[item[0], item[1]] = True return cls(model_set, seg_set, tar.tocsr(), non.tocsr()) - @classmethod def merge(cls, key_list): raise NotImplementedError() - - def to_ndx(self): """Converts TrialKey object into TrialNdx object. @@ -117,33 +126,29 @@ def to_ndx(self): mask = np.logical_or(self.tar.toarray(), self.non.toarray()) return TrialNdx(self.model_set, self.seg_set, mask) - - def validate(self): - """Validates the attributes of the TrialKey object. 
- """ + """Validates the attributes of the TrialKey object.""" self.model_set = list2ndarray(self.model_set) self.seg_set = list2ndarray(self.seg_set) shape = (len(self.model_set), len(self.seg_set)) - assert(len(np.unique(self.model_set)) == shape[0]) - assert(len(np.unique(self.seg_set)) == shape[1]) - + assert len(np.unique(self.model_set)) == shape[0] + assert len(np.unique(self.seg_set)) == shape[1] if (self.tar is None) or (self.non is None): - self.tar = sparse.csr_matrix(shape, dtype='bool') - self.non = sparse.csr_matrix(shape, dtype='bool') + self.tar = sparse.csr_matrix(shape, dtype="bool") + self.non = sparse.csr_matrix(shape, dtype="bool") else: - assert(self.tar.shape == shape) - assert(self.non.shape == shape) - + assert self.tar.shape == shape + assert self.non.shape == shape + if self.model_cond is not None: - assert(self.model_cond.shape[1] == shape[0]) + assert self.model_cond.shape[1] == shape[0] if self.seg_cond is not None: - assert(self.seg_cond.shape[1] == shape[1]) + assert self.seg_cond.shape[1] == shape[1] if self.trial_cond is not None: - assert(self.trial_cond.shape[1:] == shape) - + assert self.trial_cond.shape[1:] == shape + if self.model_cond_name is not None: self.model_cond_name = list2ndarray(self.model_cond_name) if self.seg_cond_name is not None: @@ -151,18 +156,24 @@ def validate(self): if self.trial_cond_name is not None: self.trial_cond_name = list2ndarray(self.trial_cond_name) - @classmethod def from_trial_key(cls, key): tar = sparse.csr_matrix(key.tar) non = sparse.csr_matrix(key.non) tar.eliminate_zeros() non.eliminate_zeros() - return cls(key.model_set, key.seg_set, tar, non, - key.model_cond, key.seg_cond, key.trial_cond, - key.model_cond_name, key.seg_cond_name, key.trial_cond_name) - - + return cls( + key.model_set, + key.seg_set, + tar, + non, + key.model_cond, + key.seg_cond, + key.trial_cond, + key.model_cond_name, + key.seg_cond_name, + key.trial_cond_name, + ) def __eq__(self, other): """Equal operator""" @@ -175,7 +186,7 @@ def __eq__(self, other): eq = eq and np.all(self.non.data == other.non.data) eq = eq and np.all(self.tar.indices == other.tar.indices) eq = eq and np.all(self.non.indices == other.non.indices) - + eq = eq and ((self.model_cond is None) == (other.model_cond is None)) eq = eq and ((self.seg_cond is None) == (other.seg_cond is None)) eq = eq and ((self.trial_cond is None) == (other.trial_cond is None)) @@ -187,19 +198,15 @@ def __eq__(self, other): if self.trial_cond is not None: eq = eq and np.all(self.triall_cond == other.trial_cond) - eq = eq and ( - (self.model_cond_name is None) == (other.model_cond_name is None)) - eq = eq and ( - (self.seg_cond_name is None) == (other.seg_cond_name is None)) - eq = eq and ( - (self.trial_cond_name is None) == (other.trial_cond_name is None)) + eq = eq and ((self.model_cond_name is None) == (other.model_cond_name is None)) + eq = eq and ((self.seg_cond_name is None) == (other.seg_cond_name is None)) + eq = eq and ((self.trial_cond_name is None) == (other.trial_cond_name is None)) if self.model_cond_name is not None: eq = eq and np.all(self.model_cond_name == other.model_cond_name) if self.seg_cond_name is not None: eq = eq and np.all(self.seg_cond_name == other.seg_cond_name) if self.trial_cond_name is not None: - eq = eq and np.all(self.triall_cond_name == other.trial_cond_name) + eq = eq and np.all(self.triall_cond_name == other.trial_cond_name) return eq - diff --git a/hyperion/utils/sparse_trial_scores.py b/hyperion/utils/sparse_trial_scores.py index 60abd1b0..d269c629 100644 
--- a/hyperion/utils/sparse_trial_scores.py +++ b/hyperion/utils/sparse_trial_scores.py @@ -10,7 +10,8 @@ import numpy as np import scipy.sparse as sparse -#import h5py + +# import h5py from ..hyp_defs import float_cpu from .list_utils import * @@ -22,9 +23,9 @@ class SparseTrialScores(TrialScores): - """ Contains the scores for the speaker recognition trials. + """Contains the scores for the speaker recognition trials. Bosaris compatible Scores. - + Attributes: model_set: List of model names. seg_set: List of test segment names. @@ -35,11 +36,9 @@ class SparseTrialScores(TrialScores): def __init__(self, model_set=None, seg_set=None, scores=None, score_mask=None): super(SparseTrialScores, self).__init__(model_set, seg_set, scores, score_mask) - def save_h5(self, file_path): raise NotImplementedError() - def save_txt(self, file_path): """Saves object to txt file. @@ -48,18 +47,17 @@ def save_txt(self, file_path): """ self.score_mask.eliminate_zeros() score_mask = self.score_mask.tocoo() - with open(file_path, 'w') as f: + with open(file_path, "w") as f: for r, c in zip(score_mask.row, score_mask.col): - f.write('%s %s %f\n' % - (self.model_set[r], self.seg_set[c], - self.scores[r, c])) - + f.write( + "%s %s %f\n" + % (self.model_set[r], self.seg_set[c], self.scores[r, c]) + ) @classmethod def load_h5(cls, file_path): raise NotImplementedError() - @classmethod def load_txt(cls, file_path): """Loads object from h5 file @@ -70,36 +68,35 @@ def load_txt(cls, file_path): Returns: SparseTrialScores object. """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: fields = [line.split() for line in f] models = [i[0] for i in fields] segments = [i[1] for i in fields] scores_v = np.array([i[2] for i in fields]) - + model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True) + models, return_index=True, return_inverse=True + ) seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True) + segments, return_index=True, return_inverse=True + ) scores = sparse.lil_matrix((len(model_set), len(seg_set)), dtype=float_cpu()) - score_mask = sparse.lil_matrix(scores.shape, dtype='bool') + score_mask = sparse.lil_matrix(scores.shape, dtype="bool") for item in zip(model_idx, seg_idx, scores_v): score_mask[item[0], item[1]] = True scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores.tocsr(), score_mask.tocsr()) - @classmethod def merge(cls, scr_list): raise NotImplementedError() - - def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): - """Splits the TrialScores into num_model_parts x num_seg_parts and returns part + """Splits the TrialScores into num_model_parts x num_seg_parts and returns part (model_idx, seg_idx). - + Args: model_idx: Model index of the part to return from 1 to num_model_parts. num_model_parts: Number of parts to split the model list. @@ -110,47 +107,38 @@ def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): Subpart of the TrialScores """ - model_set, model_idx1 = split_list(self.model_set, - model_idx, num_model_parts) - seg_set, seg_idx1 = split_list(self.seg_set, - seg_idx, num_seg_parts) + model_set, model_idx1 = split_list(self.model_set, model_idx, num_model_parts) + seg_set, seg_idx1 = split_list(self.seg_set, seg_idx, num_seg_parts) ix = np.ix_(model_idx1, seg_idx1) scores = self.scores[ix] score_mask = self.score_mask[ix] return SparseTrialScores(model_set, seg_set, scores, score_mask) - - def validate(self): - """Validates the attributes of the TrialKey object. 
- """ + """Validates the attributes of the TrialKey object.""" self.model_set = list2ndarray(self.model_set) self.seg_set = list2ndarray(self.seg_set) assert len(np.unique(self.model_set)) == len(self.model_set) - assert len(np.unique(self.seg_set)) == len(self.seg_set) + assert len(np.unique(self.seg_set)) == len(self.seg_set) if self.scores is None: self.scores = sparse.csr_matrix( - (len(model_set), len(seg_set)), dtype=float_cpu()) + (len(model_set), len(seg_set)), dtype=float_cpu() + ) else: - assert (self.scores.shape == - (len(self.model_set), len(self.seg_set))) + assert self.scores.shape == (len(self.model_set), len(self.seg_set)) assert np.all(np.isfinite(self.scores.data)) if self.score_mask is None: self.score_mask = sparse.csr_matrix( - np.ones((len(self.model_set), len(self.seg_set)), - dtype='bool')) + np.ones((len(self.model_set), len(self.seg_set)), dtype="bool") + ) else: - assert (self.score_mask.shape == - (len(self.model_set), len(self.seg_set))) - + assert self.score_mask.shape == (len(self.model_set), len(self.seg_set)) - - def filter(self, model_set, seg_set, keep=True, raise_missing=True): """Removes elements from TrialScores object. - + Args: model_set: List of models to keep or remove. seg_set: List of test segments to keep or remove. @@ -162,24 +150,24 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): Filtered TrialScores object. """ - if not(keep): - model_set=np.setdiff1d(self.model_set, model_set) - seg_set=np.setdiff1d(self.model_set, seg_set) + if not (keep): + model_set = np.setdiff1d(self.model_set, model_set) + seg_set = np.setdiff1d(self.model_set, seg_set) f_mod, mod_idx = ismember(model_set, self.model_set) f_seg, seg_idx = ismember(seg_set, self.seg_set) if not (np.all(f_mod) and np.all(f_seg)): - for i in (f_mod==0).nonzero()[0]: - logging.info('model %s not found' % model_set[i]) - for i in (f_seg==0).nonzero()[0]: - logging.info('segment %s not found' % seg_set[i]) + for i in (f_mod == 0).nonzero()[0]: + logging.info("model %s not found" % model_set[i]) + for i in (f_seg == 0).nonzero()[0]: + logging.info("segment %s not found" % seg_set[i]) if raise_missing: - raise Exception('some scores were not computed') + raise Exception("some scores were not computed") - #model_set = self.model_set[mod_idx] - #set_set = self.seg_set[seg_idx] - #ix = np.ix_(mod_idx, seg_idx) + # model_set = self.model_set[mod_idx] + # set_set = self.seg_set[seg_idx] + # ix = np.ix_(mod_idx, seg_idx) # logging.info('hola1') # new_src = [[self.scores[r,c], i, j] for i,r in enumerate(mod_idx) for j,c in enumerate(seg_idx) if self.score_mask[r,c]] @@ -198,18 +186,18 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): scores = self.scores.tocoo() new_data = scores.data new_row = scores.row.copy() - for i,r in enumerate(mod_idx): + for i, r in enumerate(mod_idx): if f_mod[i] and i != r: idx = scores.row == r new_row[idx] = i new_col = scores.col.copy() - for j,c in enumerate(seg_idx): + for j, c in enumerate(seg_idx): if f_seg[j] and j != c: idx = scores.col == c new_col[idx] = j - idx = np.logical_and(new_row=0])) == np.max(self.part)+1 + assert len(np.unique(self.part[self.part >= 0])) == np.max(self.part) + 1 if self.mask is not None: assert len(self.mask) == len(self.part) - def _make_part2num(self): if self._part2num is not None: return assert self.part_names is not None - self._part2num = {p:k for k,p in enumerate(self.part_names)} + self._part2num = {p: k for k, p in enumerate(self.part_names)} - def copy(self): - """Returns a copy 
of the object. - """ + """Returns a copy of the object.""" return deepcopy(self) - - def __len__(self): - """Returns number of parts. - """ + """Returns number of parts.""" return self.num_parts() - - def num_parts(self): - """Returns number of parts. - """ - return np.max(self.part)+1 - + """Returns number of parts.""" + return np.max(self.part) + 1 - def align_with_key(self, key, raise_missing=True): """Aligns the part list with a given key - + Args: key: Key to align the part and key variables of the object. - raise_missing: if True, raises exception when an element of key is + raise_missing: if True, raises exception when an element of key is not found in the object. """ f, idx = ismember(key, self.key) @@ -87,13 +74,11 @@ def align_with_key(self, key, raise_missing=True): if self.mask is not None: self.mask = self.mask[idx] else: - for i in (f==0).nonzero()[0]: - logging.warning('segment %s not found' % key[i]) + for i in (f == 0).nonzero()[0]: + logging.warning("segment %s not found" % key[i]) if raise_missing: - raise Exception('some scores were not computed') - + raise Exception("some scores were not computed") - def get_part_idx(self, part): """Returns a part boolean indices @@ -105,16 +90,14 @@ def get_part_idx(self, part): test_idx: Indices of the elements used for test """ if isinstance(part, str): - self._make_part2num() - part = self._part2num[part] - + self._make_part2num() + part = self._part2num[part] + idx = self.part == part if self.mask is not None: idx = np.logical_and(idx, self.mask) return idx - - def get_part(self, part): """Returns a part keys @@ -129,8 +112,6 @@ def get_part(self, part): train_idx, test_idx = self.get_part_idx(part) return self.key[train_idx], self.key[test_idx] - - def __getitem__(self, part): """Returns a part keys @@ -144,26 +125,22 @@ def __getitem__(self, part): return self.get_part(part) - - - def save(self, file_path, sep=' '): + def save(self, file_path, sep=" "): """Saves object to txt file Args: file_path: File path sep: Separator between part field and key field """ - with open(file_path, 'w') as f: - for p,k in zip(self.part, self.key): + with open(file_path, "w") as f: + for p, k in zip(self.part, self.key): if self.part_names is None: - f.write('%s%s%d\n' % (k,sep,p)) + f.write("%s%s%d\n" % (k, sep, p)) else: - f.write('%s%s%d%s\n' % (k,sep,p,self.part_names[p])) - + f.write("%s%s%d%s\n" % (k, sep, p, self.part_names[p])) - @classmethod - def load(cls, file_path, sep=' '): + def load(cls, file_path, sep=" "): """Loads object from txt file Args: @@ -174,7 +151,7 @@ def load(cls, file_path, sep=' '): PartList object """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: fields = [line.rstrip().split(sep=sep, maxsplit=2) for line in f] key = np.asarray([f[0] for f in fields]) part = np.asarray([int(f[1]) for f in fields], dtype=int) @@ -184,25 +161,33 @@ def load(cls, file_path, sep=' '): part_names = np.asarray([f[2] for f in fields], dtype=int) _, part_idx = np.unique(part, return_index=True) part_names = part_names[part_idx] - - return cls(key, part, part_names=part_names) + return cls(key, part, part_names=part_names) - @classmethod - def create(cls, segment_key, part_proportions, part_names=None, balance_by_key=None, group_by_key=None, mask=None, shuffle=True, seed=1024): + def create( + cls, + segment_key, + part_proportions, + part_names=None, + balance_by_key=None, + group_by_key=None, + mask=None, + shuffle=True, + seed=1024, + ): """Creates a PartList object. 
Args: segment_key: String List of recordings/speech segments - part_proportions: % of data assigned to each part. + part_proportions: % of data assigned to each part. We can do as many parts as we want, not only 3. - Vector of dimension num_parts - 1, the last part is assumed to be the rest of the data. + Vector of dimension num_parts - 1, the last part is assumed to be the rest of the data. part_names: Names of the parts, by default ['train', 'val', 'eval']. - balance_by_key: String List of keys indicating a property of the segment to make all parts to + balance_by_key: String List of keys indicating a property of the segment to make all parts to have the same number of elements of each class. E.g. for language ID this would be the language of the recording. - group_by_key: String List of keys indicating a property of the segment to make all the elements + group_by_key: String List of keys indicating a property of the segment to make all the elements of the same class to be in the same part. E.g. for language ID this would be the speaker ID of the recording. mask: Boolean numpy array to mask elements of segment_key out. @@ -212,18 +197,18 @@ def create(cls, segment_key, part_proportions, part_names=None, balance_by_key=N PartList object. """ - num_parts = len(part_proportions)+1 + num_parts = len(part_proportions) + 1 cum_prop = np.hstack(([0], np.cumsum(part_proportions), [1])) - + if part_names is None: if num_parts == 3: - part_names = ['train', 'val', 'eval'] + part_names = ["train", "val", "eval"] elif num_parts == 2: - part_names = ['train', 'eval'] - + part_names = ["train", "eval"] + if shuffle: rng = np.random.RandomState(seed=seed) - + if group_by_key is None: group_by_key = segment_key @@ -233,13 +218,13 @@ def create(cls, segment_key, part_proportions, part_names=None, balance_by_key=N _, balance_by_key = np.unique(balance_by_key, return_inverse=True) if mask is not None: - balance_by_key[mask==False] = -1 - - parts = - np.ones((len(segment_key),), dtype=int) - + balance_by_key[mask == False] = -1 + + parts = -np.ones((len(segment_key),), dtype=int) + num_classes = np.max(balance_by_key) + 1 for i in range(num_classes): - + idx_i = (balance_by_key == i).nonzero()[0] group_key_i = group_by_key[idx_i] _, group_key_i = np.unique(group_key_i, return_inverse=True) @@ -250,20 +235,18 @@ def create(cls, segment_key, part_proportions, part_names=None, balance_by_key=N rng.shuffle(shuffle_idx) group_key_tmp = np.zeros_like(group_key_i) for j in range(num_groups_i): - group_key_tmp[group_key_i==j] = shuffle_idx[j] + group_key_tmp[group_key_i == j] = shuffle_idx[j] group_key_i = group_key_tmp - + for j in range(num_parts): - k1 = int(np.round(cum_prop[j]*num_groups_i)) - k2 = int(np.round(cum_prop[j+1]*num_groups_i)) - idx_ij = np.logical_and(group_key_i>=k1, group_key_i= k1, group_key_i < k2) idx_part = idx_i[idx_ij] parts[idx_part] = j if mask is None: - assert np.all(parts>=0) + assert np.all(parts >= 0) else: - assert np.all(parts[mask]>=0) + assert np.all(parts[mask] >= 0) return cls(segment_key, parts, part_names, mask) - - diff --git a/hyperion/utils/trial_key.py b/hyperion/utils/trial_key.py index 4d00f01d..b22babda 100644 --- a/hyperion/utils/trial_key.py +++ b/hyperion/utils/trial_key.py @@ -14,7 +14,7 @@ class TrialKey(object): - """ Contains the trial key for speaker recognition trials. + """Contains the trial key for speaker recognition trials. Bosaris compatible Key. 
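To make the create() arguments above concrete, here is an illustrative partition of a toy segment list into train/val/eval parts; the class name PartList follows the docstrings in this hunk, but the import path, labels and proportions are assumptions:

    import numpy as np
    from hyperion.utils.part_list import PartList  # module path is a guess

    segs = np.array(["seg%02d" % i for i in range(12)])
    langs = np.array(["eng"] * 6 + ["spa"] * 6)               # classes to balance across parts
    spks = np.array(["spk%d" % (i // 2) for i in range(12)])  # keep each speaker in a single part

    pl = PartList.create(
        segs,
        part_proportions=[0.5, 0.25],   # train 50%, val 25%, eval gets the rest
        balance_by_key=langs,
        group_by_key=spks,
        seed=1234,
    )
    train_mask = pl.get_part_idx("train")   # boolean index over segs
    print(segs[train_mask])
    pl.save("parts.txt")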
Attributes: @@ -342,7 +342,7 @@ def merge(cls, key_list): def filter(self, model_set, seg_set, keep=True): """Removes elements from TrialKey object. - + Args: model_set: List of models to keep or remove. seg_set: List of test segments to keep or remove. @@ -392,9 +392,9 @@ def filter(self, model_set, seg_set, keep=True): ) def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): - """Splits the TrialKey into num_model_parts x num_seg_parts and returns part + """Splits the TrialKey into num_model_parts x num_seg_parts and returns part (model_idx, seg_idx). - + Args: model_idx: Model index of the part to return from 1 to num_model_parts. num_model_parts: Number of parts to split the model list. @@ -444,8 +444,7 @@ def to_ndx(self): return TrialNdx(self.model_set, self.seg_set, mask) def validate(self): - """Validates the attributes of the TrialKey object. - """ + """Validates the attributes of the TrialKey object.""" self.model_set = list2ndarray(self.model_set) self.seg_set = list2ndarray(self.seg_set) diff --git a/hyperion/utils/trial_ndx.py b/hyperion/utils/trial_ndx.py index 9b735155..783f39c4 100644 --- a/hyperion/utils/trial_ndx.py +++ b/hyperion/utils/trial_ndx.py @@ -13,8 +13,8 @@ class TrialNdx(object): - """ Contains the trial index to run speaker recognition trials. - Bosaris compatible Ndx. + """Contains the trial index to run speaker recognition trials. + Bosaris compatible Ndx. Attributes: model_set: List of model names. seg_set: List of test segment names. @@ -27,29 +27,25 @@ def __init__(self, model_set=None, seg_set=None, trial_mask=None): self.trial_mask = trial_mask if (model_set is not None) and (seg_set is not None): self.validate() - + @property def num_models(self): return len(self.model_set) - @property def num_tests(self): return len(self.seg_set) - - + def copy(self): """Makes a copy of the object""" return copy.deepcopy(self) - def sort(self): """Sorts the object by model and test segment names.""" self.model_set, m_idx = sort(self.model_set, return_index=True) self.seg_set, s_idx = sort(self.seg_set, return_index=True) self.trial_mask = self.trial_mask[np.ix_(m_idx, s_idx)] - - + def save(self, file_path): """Saves object to txt/h5 file. @@ -57,25 +53,23 @@ def save(self, file_path): file_path: File to write the list. """ file_base, file_ext = path.splitext(file_path) - if file_ext == '.h5' or file_ext == '.hdf5' : + if file_ext == ".h5" or file_ext == ".hdf5": self.save_h5(file_path) else: self.save_txt(file_path) - def save_h5(self, file_path): """Saves object to h5 file. Args: file_path: File to write the list. """ - with h5py.File(file_path, 'w') as f: - model_set = self.model_set.astype('S') - seg_set = self.seg_set.astype('S') - f.create_dataset('ID/row_ids', data=model_set) - f.create_dataset('ID/column_ids', data=seg_set) - f.create_dataset('trial_mask', - data=self.trial_mask.astype('uint8')) + with h5py.File(file_path, "w") as f: + model_set = self.model_set.astype("S") + seg_set = self.seg_set.astype("S") + f.create_dataset("ID/row_ids", data=model_set) + f.create_dataset("ID/column_ids", data=seg_set) + f.create_dataset("trial_mask", data=self.trial_mask.astype("uint8")) # model_set = self.model_set.astype('S') # f.create_dataset('ID/row_ids', self.model_set.shape, dtype=model_set.dtype) @@ -86,18 +80,16 @@ def save_h5(self, file_path): # f.create_dataset('trial_mask', self.trial_mask.shape, dtype='uint8') # f['trial_mask'] = self.trial_mask.astype('uint8') - def save_txt(self, file_path): """Saves object to txt file. 
Args: file_path: File to write the list. """ - idx=(self.trial_mask.T == True).nonzero() - with open(file_path, 'w') as f: + idx = (self.trial_mask.T == True).nonzero() + with open(file_path, "w") as f: for item in zip(idx[0], idx[1]): - f.write('%s %s\n' % (self.model_set[item[1]], self.seg_set[item[0]])) - + f.write("%s %s\n" % (self.model_set[item[1]], self.seg_set[item[0]])) @classmethod def load(cls, file_path): @@ -110,12 +102,11 @@ def load(cls, file_path): TrialNdx object. """ file_base, file_ext = path.splitext(file_path) - if file_ext == '.h5' or file_ext == '.hdf5' : + if file_ext == ".h5" or file_ext == ".hdf5": return cls.load_h5(file_path) else: return cls.load_txt(file_path) - @classmethod def load_h5(cls, file_path): """Loads object from h5 file @@ -126,13 +117,12 @@ def load_h5(cls, file_path): Returns: TrialNdx object. """ - with h5py.File(file_path, 'r') as f: - model_set = [t.decode('utf-8') for t in f['ID/row_ids']] - seg_set = [t.decode('utf-8') for t in f['ID/column_ids']] - trial_mask = np.asarray(f['trial_mask'], dtype='bool') + with h5py.File(file_path, "r") as f: + model_set = [t.decode("utf-8") for t in f["ID/row_ids"]] + seg_set = [t.decode("utf-8") for t in f["ID/column_ids"]] + trial_mask = np.asarray(f["trial_mask"], dtype="bool") return cls(model_set, seg_set, trial_mask) - @classmethod def load_txt(cls, file_path): """Loads object from txt file @@ -143,21 +133,21 @@ def load_txt(cls, file_path): Returns: TrialNdx object. """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: fields = [line.split() for line in f] models = [i[0] for i in fields] segments = [i[1] for i in fields] model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True) + models, return_index=True, return_inverse=True + ) seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True) - trial_mask = np.zeros((len(model_set), len(seg_set)), dtype='bool') + segments, return_index=True, return_inverse=True + ) + trial_mask = np.zeros((len(model_set), len(seg_set)), dtype="bool") for item in zip(model_idx, seg_idx): trial_mask[item[0], item[1]] = True return cls(model_set, seg_set, trial_mask) - - @classmethod def merge(cls, ndx_list): """Merges several index objects. 
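A compact TrialNdx save/load round trip, using the extension-based dispatch shown above (.h5/.hdf5 goes through HDF5, anything else through the text format); the model, segment and file names are invented:

    import numpy as np
    from hyperion.utils.trial_ndx import TrialNdx

    model_set = np.array(["spk1", "spk2"])
    seg_set = np.array(["segA", "segB", "segC"])
    mask = np.array([[True, True, False],
                     [False, True, True]])

    ndx = TrialNdx(model_set, seg_set, mask)
    ndx.save("trials.txt")        # text format: one "model segment" line per active trial
    ndx2 = TrialNdx.load("trials.txt")
    assert ndx2 == ndx            # holds here because every model/segment appears in some trial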
@@ -176,65 +166,67 @@ def merge(cls, ndx_list): ndx_i = ndx_list[i] new_model_set = np.union1d(model_set, ndx_i.model_set) new_seg_set = np.union1d(seg_set, ndx_i.seg_set) - trial_mask_1 = np.zeros((len(new_model_set), len(new_seg_set)), - dtype='bool') - _, mi_a, mi_b = intersect(new_model_set, model_set, - assume_unique=True, return_index=True) - _, si_a, si_b = intersect(new_seg_set, seg_set, - assume_unique=True, return_index=True) + trial_mask_1 = np.zeros( + (len(new_model_set), len(new_seg_set)), dtype="bool" + ) + _, mi_a, mi_b = intersect( + new_model_set, model_set, assume_unique=True, return_index=True + ) + _, si_a, si_b = intersect( + new_seg_set, seg_set, assume_unique=True, return_index=True + ) trial_mask_1[np.ix_(mi_a, si_a)] = trial_mask[np.ix_(mi_b, si_b)] - - trial_mask_2=np.zeros((len(new_model_set), len(new_seg_set)), - dtype='bool') - _, mi_a, mi_b = intersect(new_model_set, ndx_i.model_set, - assume_unique=True, return_index=True) - _, si_a, si_b = intersect(new_seg_set, ndx_i.seg_set, - assume_unique=True, return_index=True) - trial_mask_2[np.ix_(mi_a, si_a)] = ndx_i.trial_mask[ - np.ix_(mi_b, si_b)] + + trial_mask_2 = np.zeros( + (len(new_model_set), len(new_seg_set)), dtype="bool" + ) + _, mi_a, mi_b = intersect( + new_model_set, ndx_i.model_set, assume_unique=True, return_index=True + ) + _, si_a, si_b = intersect( + new_seg_set, ndx_i.seg_set, assume_unique=True, return_index=True + ) + trial_mask_2[np.ix_(mi_a, si_a)] = ndx_i.trial_mask[np.ix_(mi_b, si_b)] model_set = new_model_set seg_set = new_seg_set - trial_mask= np.logical_or(trial_mask_1, trial_mask_2) - - return cls(model_set, seg_set, trial_mask) + trial_mask = np.logical_or(trial_mask_1, trial_mask_2) + return cls(model_set, seg_set, trial_mask) - @staticmethod - def parse_eval_set(ndx, enroll, test=None, eval_set='enroll-test'): + def parse_eval_set(ndx, enroll, test=None, eval_set="enroll-test"): """Prepares the data structures required for evaluation. - + Args: ndx: TrialNdx object cotaining the trials for the main evaluation. enroll: Utt2Info where key are file_ids and second column are model names - test: Utt2Info of where key are test segments names. + test: Utt2Info of where key are test segments names. Needed in the cases enroll-coh and coh-coh. eval_test: Type of of evaluation enroll-test: main evaluation of enrollment vs test segments. enroll-coh: enrollment vs cohort segments. coh-test: cohort vs test segments. coh-coh: cohort vs cohort segments. - + Return: ndx: TrialNdx object - enroll: SCPList + enroll: SCPList """ - if eval_set == 'enroll-test': + if eval_set == "enroll-test": enroll = enroll.filter_info(ndx.model_set) - if eval_set == 'enroll-coh': + if eval_set == "enroll-coh": ndx = TrialNdx(ndx.model_set, test.file_path) enroll = enroll.filter_info(ndx.model_set) - if eval_set == 'coh-test': + if eval_set == "coh-test": ndx = TrialNdx(enroll.key, ndx.seg_set) - if eval_set == 'coh-coh': + if eval_set == "coh-coh": ndx = TrialNdx(enroll.key, test.file_path) return ndx, enroll - def filter(self, model_set, seg_set, keep=True): """Removes elements from TrialNdx object. - + Args: model_set: List of models to keep or remove. seg_set: List of test segments to keep or remove. @@ -244,7 +236,7 @@ def filter(self, model_set, seg_set, keep=True): Returns: Filtered TrialNdx object. 
""" - if not(keep): + if not (keep): model_set = np.setdiff1d(self.model_set, model_set) seg_set = np.setdiff1d(self.seg_set, seg_set) @@ -257,12 +249,10 @@ def filter(self, model_set, seg_set, keep=True): trial_mask = self.trial_mask[np.ix_(mod_idx, seg_idx)] return TrialNdx(model_set, seg_set, trial_mask) - - def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): - """Splits the TrialNdx into num_model_parts x num_seg_parts and returns part + """Splits the TrialNdx into num_model_parts x num_seg_parts and returns part (model_idx, seg_idx). - + Args: model_idx: Model index of the part to return from 1 to num_model_parts. num_model_parts: Number of parts to split the model list. @@ -272,41 +262,34 @@ def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): Returns: Subpart of the TrialNdx """ - model_set, model_idx1 = split_list(self.model_set, - model_idx, num_model_parts) - seg_set, seg_idx1 = split_list(self.seg_set, - seg_idx, num_seg_parts) - trial_mask=self.trial_mask[np.ix_(model_idx1, seg_idx1)] + model_set, model_idx1 = split_list(self.model_set, model_idx, num_model_parts) + seg_set, seg_idx1 = split_list(self.seg_set, seg_idx, num_seg_parts) + trial_mask = self.trial_mask[np.ix_(model_idx1, seg_idx1)] return TrialNdx(model_set, seg_set, trial_mask) - - def validate(self): - """Validates the attributes of the TrialKey object. - """ + """Validates the attributes of the TrialKey object.""" self.model_set = list2ndarray(self.model_set) self.seg_set = list2ndarray(self.seg_set) assert len(np.unique(self.model_set)) == len(self.model_set) assert len(np.unique(self.seg_set)) == len(self.seg_set) if self.trial_mask is None: - self.trial_mask = np.ones((len(self.model_set), len(self.seg_set)), - dtype='bool') + self.trial_mask = np.ones( + (len(self.model_set), len(self.seg_set)), dtype="bool" + ) else: - assert (self.trial_mask.shape == - (len(self.model_set), len(self.seg_set))) - - + assert self.trial_mask.shape == (len(self.model_set), len(self.seg_set)) def apply_segmentation_to_test(self, segment_list): """Splits test segment into multiple sub-segments - Useful to create ndx for spk diarization or tracking. + Useful to create ndx for spk diarization or tracking. - Args: - segment_list: ExtSegmentList object with mapping of - file_id to ext_segment_id - Returns: - New TrialNdx object with segment_ids in test instead of file_id. + Args: + segment_list: ExtSegmentList object with mapping of + file_id to ext_segment_id + Returns: + New TrialNdx object with segment_ids in test instead of file_id. 
""" new_segset = [] new_mask = [] @@ -314,14 +297,14 @@ def apply_segmentation_to_test(self, segment_list): file_id = self.seg_set[i] segment_ids = segment_list.ext_segment_ids_from_file(file_id) new_segset.append(segment_ids) - new_mask.append(np.repeat(self.trial_mask[:,i,None],len(segment_ids), axis=1)) + new_mask.append( + np.repeat(self.trial_mask[:, i, None], len(segment_ids), axis=1) + ) new_segset = np.concatenate(tuple(new_segset)) new_mask = np.concatenate(tuple(new_mask), axis=-1) return TrialNdx(self.model_set, new_segset, new_mask) - - - + def __eq__(self, other): """Equal operator""" eq = self.model_set.shape == other.model_set.shape @@ -331,71 +314,58 @@ def __eq__(self, other): eq = eq and np.all(self.trial_mask == other.trial_mask) return eq - - def __ne__(self, other): """Non-equal operator""" return not self.__eq__(other) - - def __cmp__(self, other): """Comparison operator""" if self.__eq__(oher): return 0 return 1 - - def test(ndx_file='core-core_det5_ndx.h5'): + def test(ndx_file="core-core_det5_ndx.h5"): ndx1 = TrialNdx.load(ndx_file) ndx1.sort() ndx2 = ndx1.copy() - ndx2.model_set[0] = 'm1' + ndx2.model_set[0] = "m1" ndx2.trial_mask[:] = 0 - assert(np.any(ndx1.model_set != ndx2.model_set)) - assert(np.any(ndx1.trial_mask != ndx2.trial_mask)) + assert np.any(ndx1.model_set != ndx2.model_set) + assert np.any(ndx1.trial_mask != ndx2.trial_mask) - ndx2 = TrialNdx(ndx1.model_set[:10], ndx1.seg_set, - ndx1.trial_mask[:10,:]) - ndx3 = TrialNdx(ndx1.model_set[5:], ndx1.seg_set, - ndx1.trial_mask[5:,:]) + ndx2 = TrialNdx(ndx1.model_set[:10], ndx1.seg_set, ndx1.trial_mask[:10, :]) + ndx3 = TrialNdx(ndx1.model_set[5:], ndx1.seg_set, ndx1.trial_mask[5:, :]) ndx4 = TrialNdx.merge([ndx2, ndx3]) - assert(ndx1 == ndx4) + assert ndx1 == ndx4 - ndx2 = TrialNdx(ndx1.model_set, ndx1.seg_set[:10], - ndx1.trial_mask[:,:10]) - ndx3 = TrialNdx(ndx1.model_set, ndx1.seg_set[5:], - ndx1.trial_mask[:,5:]) + ndx2 = TrialNdx(ndx1.model_set, ndx1.seg_set[:10], ndx1.trial_mask[:, :10]) + ndx3 = TrialNdx(ndx1.model_set, ndx1.seg_set[5:], ndx1.trial_mask[:, 5:]) ndx4 = TrialNdx.merge([ndx2, ndx3]) - assert(ndx1 == ndx4) + assert ndx1 == ndx4 - ndx2 = TrialNdx(ndx1.model_set[:5], ndx1.seg_set[:10], - ndx1.trial_mask[:5,:10]) + ndx2 = TrialNdx(ndx1.model_set[:5], ndx1.seg_set[:10], ndx1.trial_mask[:5, :10]) ndx3 = ndx1.filter(ndx2.model_set, ndx2.seg_set, keep=True) - assert(ndx2 == ndx3) + assert ndx2 == ndx3 - num_parts=3 + num_parts = 3 ndx_list = [] for i in range(num_parts): for j in range(num_parts): - ndx_ij = ndx1.split(i+1, num_parts, j+1, num_parts) + ndx_ij = ndx1.split(i + 1, num_parts, j + 1, num_parts) ndx_list.append(ndx_ij) ndx2 = TrialNdx.merge(ndx_list) - assert(ndx1 == ndx2) + assert ndx1 == ndx2 - - file_h5 = 'test.h5' + file_h5 = "test.h5" ndx1.save(file_h5) ndx2 = TrialNdx.load(file_h5) - assert(ndx1 == ndx2) + assert ndx1 == ndx2 - file_txt = 'test.txt' + file_txt = "test.txt" ndx3.trial_mask[0, :] = True ndx3.trial_mask[:, 0] = True ndx3.save(file_txt) ndx2 = TrialNdx.load(file_txt) - assert(ndx3 == ndx2) - - + assert ndx3 == ndx2 diff --git a/hyperion/utils/trial_scores.py b/hyperion/utils/trial_scores.py index d66bbb92..19e17190 100644 --- a/hyperion/utils/trial_scores.py +++ b/hyperion/utils/trial_scores.py @@ -18,9 +18,9 @@ class TrialScores(object): - """ Contains the scores for the speaker recognition trials. + """Contains the scores for the speaker recognition trials. Bosaris compatible Scores. - + Attributes: model_set: List of model names. 
seg_set: List of test segment names. @@ -35,24 +35,19 @@ def __init__(self, model_set=None, seg_set=None, scores=None, score_mask=None): self.score_mask = score_mask if (model_set is not None) and (seg_set is not None): self.validate() - @property def num_models(self): return len(self.model_set) - @property def num_tests(self): return len(self.seg_set) - def copy(self): """Makes a copy of the object""" return copy.deepcopy(self) - - def sort(self): """Sorts the object by model and test segment names.""" self.model_set, m_idx = sort(self.model_set, return_index=True) @@ -60,9 +55,7 @@ def sort(self): ix = np.ix_(m_idx, s_idx) self.scores = self.scores[ix] self.score_mask = self.score_mask[ix] - - def save(self, file_path): """Saves object to txt/h5 file. @@ -70,43 +63,42 @@ def save(self, file_path): file_path: File to write the list. """ file_base, file_ext = path.splitext(file_path) - if file_ext == '.h5' or file_ext == '.hdf5' : + if file_ext == ".h5" or file_ext == ".hdf5": self.save_h5(file_path) else: self.save_txt(file_path) - def save_h5(self, file_path): """Saves object to h5 file. Args: file_path: File to write the list. """ - with h5py.File(file_path, 'w') as f: - model_set = self.model_set.astype('S') - seg_set = self.seg_set.astype('S') - f.create_dataset('ID/row_ids', data=model_set) - f.create_dataset('ID/column_ids', data=seg_set) - f.create_dataset('scores', data=self.scores) - f.create_dataset('score_mask', - data=self.score_mask.astype('uint8')) - - - + with h5py.File(file_path, "w") as f: + model_set = self.model_set.astype("S") + seg_set = self.seg_set.astype("S") + f.create_dataset("ID/row_ids", data=model_set) + f.create_dataset("ID/column_ids", data=seg_set) + f.create_dataset("scores", data=self.scores) + f.create_dataset("score_mask", data=self.score_mask.astype("uint8")) + def save_txt(self, file_path): """Saves object to txt file. Args: file_path: File to write the list. """ - idx=(self.score_mask.T == True).nonzero() - with open(file_path, 'w') as f: + idx = (self.score_mask.T == True).nonzero() + with open(file_path, "w") as f: for item in zip(idx[0], idx[1]): - f.write('%s %s %f\n' % - (self.model_set[item[1]], self.seg_set[item[0]], - self.scores[item[1], item[0]])) - - + f.write( + "%s %s %f\n" + % ( + self.model_set[item[1]], + self.seg_set[item[0]], + self.scores[item[1], item[0]], + ) + ) @classmethod def load(cls, file_path): @@ -119,13 +111,11 @@ def load(cls, file_path): TrialScores object. """ file_base, file_ext = path.splitext(file_path) - if file_ext == '.h5' or file_ext == '.hdf5' : + if file_ext == ".h5" or file_ext == ".hdf5": return cls.load_h5(file_path) else: return cls.load_txt(file_path) - - @classmethod def load_h5(cls, file_path): """Loads object from h5 file @@ -136,14 +126,13 @@ def load_h5(cls, file_path): Returns: TrialScores object. 
""" - with h5py.File(file_path, 'r') as f: - model_set = [t.decode('utf-8') for t in f['ID/row_ids']] - seg_set = [t.decode('utf-8') for t in f['ID/column_ids']] - scores = np.asarray(f['scores'], dtype=float_cpu()) - score_mask = np.asarray(f['score_mask'], dtype='bool') + with h5py.File(file_path, "r") as f: + model_set = [t.decode("utf-8") for t in f["ID/row_ids"]] + seg_set = [t.decode("utf-8") for t in f["ID/column_ids"]] + scores = np.asarray(f["scores"], dtype=float_cpu()) + score_mask = np.asarray(f["score_mask"], dtype="bool") return cls(model_set, seg_set, scores, score_mask) - @classmethod def load_txt(cls, file_path): """Loads object from h5 file @@ -154,25 +143,26 @@ def load_txt(cls, file_path): Returns: TrialScores object. """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: fields = [line.split() for line in f] models = [i[0] for i in fields] segments = [i[1] for i in fields] scores_v = np.array([i[2] for i in fields]) - + model_set, _, model_idx = np.unique( - models, return_index=True, return_inverse=True) + models, return_index=True, return_inverse=True + ) seg_set, _, seg_idx = np.unique( - segments, return_index=True, return_inverse=True) + segments, return_index=True, return_inverse=True + ) scores = np.zeros((len(model_set), len(seg_set))) - score_mask = np.zeros(scores.shape, dtype='bool') + score_mask = np.zeros(scores.shape, dtype="bool") for item in zip(model_idx, seg_idx, scores_v): score_mask[item[0], item[1]] = True - scores[item[0], item[1]]=item[2] + scores[item[0], item[1]] = item[2] return cls(model_set, seg_set, scores, score_mask) - @classmethod def merge(cls, scr_list): """Merges several score objects. @@ -194,43 +184,46 @@ def merge(cls, scr_list): new_seg_set = np.union1d(seg_set, scr_i.seg_set) shape = (len(new_model_set), len(new_seg_set)) - _, mi_a, mi_b = intersect(new_model_set, model_set, - assume_unique=True, return_index=True) - _, si_a, si_b = intersect(new_seg_set, seg_set, - assume_unique=True, return_index=True) + _, mi_a, mi_b = intersect( + new_model_set, model_set, assume_unique=True, return_index=True + ) + _, si_a, si_b = intersect( + new_seg_set, seg_set, assume_unique=True, return_index=True + ) ix_a = np.ix_(mi_a, si_a) ix_b = np.ix_(mi_b, si_b) scores_1 = np.zeros(shape) scores_1[ix_a] = scores[ix_b] - score_mask_1 = np.zeros(shape, dtype='bool') + score_mask_1 = np.zeros(shape, dtype="bool") score_mask_1[ix_a] = score_mask[ix_b] - - trial_mask_2=np.zeros((len(new_model_set), len(new_seg_set)), - dtype='bool') - _, mi_a, mi_b = intersect(new_model_set, scr_i.model_set, - assume_unique=True, return_index=True) - _, si_a, si_b = intersect(new_seg_set, scr_i.seg_set, - assume_unique=True, return_index=True) + + trial_mask_2 = np.zeros( + (len(new_model_set), len(new_seg_set)), dtype="bool" + ) + _, mi_a, mi_b = intersect( + new_model_set, scr_i.model_set, assume_unique=True, return_index=True + ) + _, si_a, si_b = intersect( + new_seg_set, scr_i.seg_set, assume_unique=True, return_index=True + ) ix_a = np.ix_(mi_a, si_a) ix_b = np.ix_(mi_b, si_b) - scores_2= np.zeros(shape) + scores_2 = np.zeros(shape) scores_2[ix_a] = scr_i.scores[ix_b] - score_mask_2 = np.zeros(shape, dtype='bool') + score_mask_2 = np.zeros(shape, dtype="bool") score_mask_2[ix_a] = scr_i.score_mask[ix_b] model_set = new_model_set seg_set = new_seg_set scores = scores_1 + scores_2 - assert(not(np.any(np.logical_and(score_mask_1, score_mask_2)))) - score_mask= np.logical_or(score_mask_1, score_mask_2) - + assert not 
(np.any(np.logical_and(score_mask_1, score_mask_2))) + score_mask = np.logical_or(score_mask_1, score_mask_2) + return cls(model_set, seg_set, scores, score_mask) - - def filter(self, model_set, seg_set, keep=True, raise_missing=True): """Removes elements from TrialScores object. - + Args: model_set: List of models to keep or remove. seg_set: List of test segments to keep or remove. @@ -242,9 +235,9 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): Filtered TrialScores object. """ - if not(keep): - model_set=np.setdiff1d(self.model_set, model_set) - seg_set=np.setdiff1d(self.model_set, seg_set) + if not (keep): + model_set = np.setdiff1d(self.model_set, model_set) + seg_set = np.setdiff1d(self.model_set, seg_set) f_mod, mod_idx = ismember(model_set, self.model_set) f_seg, seg_idx = ismember(seg_set, self.seg_set) @@ -256,12 +249,12 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): scores = self.scores[ix] score_mask = self.score_mask[ix] else: - for i in (f_mod==0).nonzero()[0]: - logging.info('model %s not found' % model_set[i]) - for i in (f_seg==0).nonzero()[0]: - logging.info('segment %s not found' % seg_set[i]) + for i in (f_mod == 0).nonzero()[0]: + logging.info("model %s not found" % model_set[i]) + for i in (f_seg == 0).nonzero()[0]: + logging.info("segment %s not found" % seg_set[i]) if raise_missing: - raise Exception('some scores were not computed') + raise Exception("some scores were not computed") scores = np.zeros((len(model_set), len(seg_set)), dtype=float_cpu()) score_mask = np.zeros(scores.shape, dtype=bool) @@ -269,15 +262,13 @@ def filter(self, model_set, seg_set, keep=True, raise_missing=True): ix2 = np.ix_(mod_idx[f_mod], seg_idx[f_seg]) scores[ix1] = self.scores[ix2] score_mask[ix1] = self.score_mask[ix2] - - return TrialScores(model_set, seg_set, scores, score_mask) + return TrialScores(model_set, seg_set, scores, score_mask) - def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): - """Splits the TrialScores into num_model_parts x num_seg_parts and returns part + """Splits the TrialScores into num_model_parts x num_seg_parts and returns part (model_idx, seg_idx). - + Args: model_idx: Model index of the part to return from 1 to num_model_parts. num_model_parts: Number of parts to split the model list. @@ -288,53 +279,47 @@ def split(self, model_idx, num_model_parts, seg_idx, num_seg_parts): Subpart of the TrialScores """ - model_set, model_idx1 = split_list(self.model_set, - model_idx, num_model_parts) - seg_set, seg_idx1 = split_list(self.seg_set, - seg_idx, num_seg_parts) + model_set, model_idx1 = split_list(self.model_set, model_idx, num_model_parts) + seg_set, seg_idx1 = split_list(self.seg_set, seg_idx, num_seg_parts) ix = np.ix_(model_idx1, seg_idx1) scores = self.scores[ix] - score_mask=self.score_mask[ix] + score_mask = self.score_mask[ix] return TrialScores(model_set, seg_set, scores, score_mask) - - def validate(self): - """Validates the attributes of the TrialScores object. 
- """ + """Validates the attributes of the TrialScores object.""" self.model_set = list2ndarray(self.model_set) self.seg_set = list2ndarray(self.seg_set) assert len(np.unique(self.model_set)) == len(self.model_set) - assert len(np.unique(self.seg_set)) == len(self.seg_set) + assert len(np.unique(self.seg_set)) == len(self.seg_set) if self.scores is None: self.scores = np.zeros((len(self.model_set), len(self.seg_set))) else: - assert (self.scores.shape == - (len(self.model_set), len(self.seg_set))) + assert self.scores.shape == (len(self.model_set), len(self.seg_set)) assert np.all(np.isfinite(self.scores)) if self.score_mask is None: - self.score_mask = np.ones((len(self.model_set), len(self.seg_set)), - dtype='bool') + self.score_mask = np.ones( + (len(self.model_set), len(self.seg_set)), dtype="bool" + ) else: - assert (self.score_mask.shape == - (len(self.model_set), len(self.seg_set))) + assert self.score_mask.shape == (len(self.model_set), len(self.seg_set)) - - def align_with_ndx(self, ndx, raise_missing=True): """Aligns scores, model_set and seg_set with TrialNdx or TrialKey. Args: ndx: TrialNdx or TrialKey object. - raise_missing: Raises exception if there are trials in ndx that are not + raise_missing: Raises exception if there are trials in ndx that are not in the score object. Returns: Aligned TrialScores object. """ - scr = self.filter(ndx.model_set, ndx.seg_set, keep=True, raise_missing=raise_missing) + scr = self.filter( + ndx.model_set, ndx.seg_set, keep=True, raise_missing=raise_missing + ) if isinstance(ndx, TrialNdx): mask = ndx.trial_mask else: @@ -344,22 +329,22 @@ def align_with_ndx(self, ndx, raise_missing=True): missing_trials = np.logical_and(mask, np.logical_not(scr.score_mask)) missing = np.any(missing_trials) if missing: - idx=(missing_trials == True).nonzero() - for i,j in zip(idx[0], idx[1]): - logging.info('missing-scores for %s %s' % - (scr.model_set[i], scr.seg_set[j])) + idx = (missing_trials == True).nonzero() + for i, j in zip(idx[0], idx[1]): + logging.info( + "missing-scores for %s %s" % (scr.model_set[i], scr.seg_set[j]) + ) if raise_missing: - raise Exception('some scores were not computed') + raise Exception("some scores were not computed") return scr - def get_tar_non(self, key): """Returns target and non target scores. - + Args: key: TrialKey object. - + Returns: Numpy array with target scores. Numpy array with non-target scores. @@ -371,8 +356,6 @@ def get_tar_non(self, key): non = scr.scores[non_mask] return tar, non - - def set_missing_to_value(self, ndx, val): """Aligns the scores with a TrialNdx and sets the trials with missing scores to the same value. @@ -394,8 +377,6 @@ def set_missing_to_value(self, ndx, val): scr.score_mask[mask] = True return scr - - def transform(self, f): """Applies a function to the valid scores of the object. 
@@ -404,9 +385,7 @@ def transform(self, f): """ mask = self.score_mask self.scores[mask] = f(self.scores[mask]) - - def __eq__(self, other): """Equal operator""" eq = self.model_set.shape == other.model_set.shape @@ -417,111 +396,126 @@ def __eq__(self, other): eq = eq and np.all(self.score_mask == other.score_mask) return eq - - def __ne__(self, other): """Non-equal operator""" return not self.__eq__(other) - - def __cmp__(self, other): """Comparison operator""" if self.__eq__(oher): return 0 return 1 - - def test(key_file='core-core_det5_key.h5'): + def test(key_file="core-core_det5_key.h5"): key = TrialKey.load(key_file) mask = np.logical_or(key.tar, key.non) - scr1 = TrialScores(key.model_set, key.seg_set, - np.random.normal(size=key.tar.shape)*mask, - mask) + scr1 = TrialScores( + key.model_set, + key.seg_set, + np.random.normal(size=key.tar.shape) * mask, + mask, + ) - scr2=scr1.copy() + scr2 = scr1.copy() scr2.sort() - assert(scr2 != scr1) + assert scr2 != scr1 scr3 = scr2.align_with_ndx(key) - assert(scr1 == scr3) - + assert scr1 == scr3 + scr1.sort() scr2 = scr1.copy() - scr2.model_set[0] = 'm1' + scr2.model_set[0] = "m1" scr2.score_mask[:] = 0 - assert(np.any(scr1.model_set != scr2.model_set)) - assert(np.any(scr1.score_mask != scr2.score_mask)) - - scr2 = TrialScores(scr1.model_set[:10], scr1.seg_set, - scr1.scores[:10,:], scr1.score_mask[:10,:]) - scr3 = TrialScores(scr1.model_set[10:], scr1.seg_set, - scr1.scores[10:,:], scr1.score_mask[10:,:]) + assert np.any(scr1.model_set != scr2.model_set) + assert np.any(scr1.score_mask != scr2.score_mask) + + scr2 = TrialScores( + scr1.model_set[:10], + scr1.seg_set, + scr1.scores[:10, :], + scr1.score_mask[:10, :], + ) + scr3 = TrialScores( + scr1.model_set[10:], + scr1.seg_set, + scr1.scores[10:, :], + scr1.score_mask[10:, :], + ) scr4 = TrialScores.merge([scr2, scr3]) - assert(scr1 == scr4) - - scr2 = TrialScores(scr1.model_set, scr1.seg_set[:10], - scr1.scores[:,:10], scr1.score_mask[:,:10]) - scr3 = TrialScores(scr1.model_set, scr1.seg_set[10:], - scr1.scores[:,10:], scr1.score_mask[:,10:]) + assert scr1 == scr4 + + scr2 = TrialScores( + scr1.model_set, + scr1.seg_set[:10], + scr1.scores[:, :10], + scr1.score_mask[:, :10], + ) + scr3 = TrialScores( + scr1.model_set, + scr1.seg_set[10:], + scr1.scores[:, 10:], + scr1.score_mask[:, 10:], + ) scr4 = TrialScores.merge([scr2, scr3]) - assert(scr1 == scr4) - - scr2 = TrialScores(scr1.model_set[:5], scr1.seg_set[:10], - scr1.scores[:5,:10], scr1.score_mask[:5,:10]) + assert scr1 == scr4 + + scr2 = TrialScores( + scr1.model_set[:5], + scr1.seg_set[:10], + scr1.scores[:5, :10], + scr1.score_mask[:5, :10], + ) scr3 = scr1.filter(scr2.model_set, scr2.seg_set, keep=True) - assert(scr2 == scr3) + assert scr2 == scr3 - num_parts=3 + num_parts = 3 scr_list = [] for i in range(num_parts): for j in range(num_parts): - scr_ij = scr1.split(i+1, num_parts, j+1, num_parts) + scr_ij = scr1.split(i + 1, num_parts, j + 1, num_parts) scr_list.append(scr_ij) scr2 = TrialScores.merge(scr_list) - assert(scr1 == scr2) + assert scr1 == scr2 - f = lambda x: 3*x + 1 + f = lambda x: 3 * x + 1 scr2 = scr1.copy() - scr2.score_mask[0,0] = True - scr2.score_mask[0,1] = False + scr2.score_mask[0, 0] = True + scr2.score_mask[0, 1] = False scr4 = scr2.copy() scr4.transform(f) - assert(scr4.scores[0,0] == 3*scr1.scores[0,0] + 1) - assert(scr4.scores[0,1] == scr1.scores[0,1]) + assert scr4.scores[0, 0] == 3 * scr1.scores[0, 0] + 1 + assert scr4.scores[0, 1] == scr1.scores[0, 1] scr2 = scr1.align_with_ndx(key) key2 = 
key.copy() scr2.score_mask[:] = False - scr2.score_mask[0,0] = True - scr2.score_mask[0,1] = True - scr2.scores[0,0] = 1 - scr2.scores[0,1] = -1 + scr2.score_mask[0, 0] = True + scr2.score_mask[0, 1] = True + scr2.scores[0, 0] = 1 + scr2.scores[0, 1] = -1 key2.tar[:] = False key2.non[:] = False - key2.tar[0,0] = True - key2.non[0,1] = True + key2.tar[0, 0] = True + key2.non[0, 1] = True [tar, non] = scr2.get_tar_non(key2) - assert(np.all(tar==[1])) - assert(np.all(non==[-1])) + assert np.all(tar == [1]) + assert np.all(non == [-1]) - scr2.score_mask[0,0] = False + scr2.score_mask[0, 0] = False scr4 = scr2.set_missing_to_value(key2, -10) - assert(scr4.scores[0,0] == -10) - - file_h5 = 'test.h5' + assert scr4.scores[0, 0] == -10 + + file_h5 = "test.h5" scr1.save(file_h5) scr2 = TrialScores.load(file_h5) - assert(scr1 == scr2) + assert scr1 == scr2 - file_txt = 'test.txt' + file_txt = "test.txt" scr3.score_mask[0, :] = True scr3.score_mask[:, 0] = True scr3.save(file_txt) scr2 = TrialScores.load(file_txt) - assert(scr3 == scr2) - - - + assert scr3 == scr2 diff --git a/hyperion/utils/trial_stats.py b/hyperion/utils/trial_stats.py index 92aea572..229bad3c 100644 --- a/hyperion/utils/trial_stats.py +++ b/hyperion/utils/trial_stats.py @@ -15,27 +15,28 @@ from .trial_ndx import TrialNdx from .trial_key import TrialKey + class TrialStats(object): - """ Contains anciliary statistics from the trial such us quality measures like SNR - + """Contains anciliary statistics from the trial such us quality measures like SNR + This class was created to store statistics about adversarial attacks like SNR (signal-to-perturbation ratio), Linf, L2 norms of the perturbation etc. Attributes: df_stats: pandas dataframe containing the stats. The dataframe needs to include the modelid and segmentid columns - + """ + def __init__(self, df_stats): self.df_stats = df_stats - assert 'modelid' in df_stats.columns - assert 'segmentid' in df_stats.columns - self.df_stats.set_index(['modelid','segmentid'], inplace=True) + assert "modelid" in df_stats.columns + assert "segmentid" in df_stats.columns + self.df_stats.set_index(["modelid", "segmentid"], inplace=True) self._stats_mats = dict() - @classmethod def load(cls, file_path): - """Loads stats file + """Loads stats file Args: file_path: stats file in csv format @@ -46,7 +47,6 @@ def load(cls, file_path): df = pd.read_csv(file_path) return cls(df) - def save_h5(self, file_path): """Saves object to file. 
@@ -54,7 +54,6 @@ def save_h5(self, file_path): file_path: CSV format file """ self.df_stats.to_csv(file_path) - def get_stats_mat(self, stat_name, ndx, raise_missing=True): """Returns a matrix of trial statistics sorted to match a give Ndx or Key object @@ -76,30 +75,26 @@ def get_stats_mat(self, stat_name, ndx, raise_missing=True): stats_mat = np.zeros(trial_mask.shape, dtype=float_cpu()) for i in range(stats_mat.shape[0]): for j in range(stats_mat.shape[1]): - if trial_mask[i,j]: + if trial_mask[i, j]: try: - stats_mat[i,j] = self.df_stats.loc[ - ndx.model_set[i], ndx.seg_set[j]][stat_name] + stats_mat[i, j] = self.df_stats.loc[ + ndx.model_set[i], ndx.seg_set[j] + ][stat_name] except: - err_str='%s not found for %s-%s' % ( - stat_name, ndx.model_set[i], ndx.seg_set[j]) + err_str = "%s not found for %s-%s" % ( + stat_name, + ndx.model_set[i], + ndx.seg_set[j], + ) if raise_missing: raise Exception(err_str) else: logging.warning(err_str) - + self._stats_mats[stat_name] = stats_mat return stats_mat - def reset_stats_mats(self): - + for k in list(self._stats_mats.keys()): del self._stats_mats[k] - - - - - - - diff --git a/hyperion/utils/utt2info.py b/hyperion/utils/utt2info.py index 9b2c78e4..3cf4179b 100644 --- a/hyperion/utils/utt2info.py +++ b/hyperion/utils/utt2info.py @@ -19,7 +19,7 @@ class Utt2Info(object): Attributes: key: segment key name. - info: + info: key_to_index: Dictionary that returns the position of a key in the list. """ @@ -29,70 +29,59 @@ def __init__(self, utt_info): self.utt_info.index = self.utt_info.key self.key_to_index = None - def validate(self): - """Validates the attributes of the Utt2Info object. - """ - assert 'key' in self.utt_info.columns + """Validates the attributes of the Utt2Info object.""" + assert "key" in self.utt_info.columns assert self.utt_info.shape[1] >= 2 # assert self.utt_info['key'].nunique() == self.utt_info.shape[0] - @classmethod def create(cls, key, info): key = np.asarray(key) info = np.asarray(info) if info.ndim == 2: - data = np.hstack((key[:,None], info)) + data = np.hstack((key[:, None], info)) else: data = np.vstack((key, info)).T num_columns = data.shape[1] - columns = ['key'] + [i for i in range(1, num_columns)] + columns = ["key"] + [i for i in range(1, num_columns)] utt_info = pd.DataFrame(data, columns=columns) return cls(utt_info) - @property def num_info_fields(self): - return self.utt_info.shape[1]-1 + return self.utt_info.shape[1] - 1 - @property def key(self): - return np.asarray(self.utt_info['key']) + return np.asarray(self.utt_info["key"]) - @property def info(self): if self.utt_info.shape[1] > 2: - return np.asarray(self.utt_info.iloc[:,1:]) + return np.asarray(self.utt_info.iloc[:, 1:]) else: return np.asarray(self.utt_info[1]) - - def copy(self): """Makes a copy of the object.""" return deepcopy(self) - def __len__(self): """Returns the number of elements in the list.""" return len(self.utt_info) - - + def len(self): """Returns the number of elements in the list.""" return len(self.utt_info) - def _create_dict(self): - """Creates dictionary that returns the position of - a segment in the list. + """Creates dictionary that returns the position of + a segment in the list. 
""" self.key_to_index = OrderedDict( - (k,i) for i, k in enumerate(self.utt_info.index)) - + (k, i) for i, k in enumerate(self.utt_info.index) + ) def get_index(self, key): """Returns the position of key in the list.""" @@ -100,21 +89,19 @@ def get_index(self, key): self._create_dict() return self.key_to_index[key] - def __contains__(self, key): - """ Returns True if the list contains the key""" + """Returns True if the list contains the key""" return key in self.utt_info.index - - + def __getitem__(self, key): - """It allows to acces the data in the list by key or index like in + """It allows to acces the data in the list by key or index like in a ditionary, e.g.: If input is a string key: utt2spk = Utt2Info(info) spk_id = utt2spk['data1'] If input is an index: key, spk_id = utt2spk[0] - + Args: key: String key or integer index. Returns: @@ -136,8 +123,6 @@ def __getitem__(self, key): else: return row[0], row[1:] - - def sort(self, field=0): """Sorts the list by key""" if field == 0: @@ -147,9 +132,7 @@ def sort(self, field=0): self.utt_info = self.utt_info.iloc[idx] self.key_to_index = None - - - def save(self, file_path, sep=' '): + def save(self, file_path, sep=" "): """Saves uttinfo to text file. Args: @@ -158,10 +141,8 @@ def save(self, file_path, sep=' '): """ self.utt_info.to_csv(file_path, sep=sep, header=False, index=False) - - @classmethod - def load(cls, file_path, sep=' ', dtype={0:np.str, 1:np.str}): + def load(cls, file_path, sep=" ", dtype={0: np.str, 1: np.str}): """Loads utt2info list from text file. Args: @@ -172,14 +153,12 @@ def load(cls, file_path, sep=' ', dtype={0:np.str, 1:np.str}): Utt2Info object """ df = pd.read_csv(file_path, sep=sep, header=None, dtype=dtype) - df = df.rename(index=str, columns={0:'key'}) + df = df.rename(index=str, columns={0: "key"}) return cls(df) - - def split(self, idx, num_parts, group_by_field=0): - """ Splits SCPList into num_parts and return part idx. - + """Splits SCPList into num_parts and return part idx. + Args: idx: Part to return from 1 to num_parts. num_parts: Number of parts to split the list. @@ -190,22 +169,22 @@ def split(self, idx, num_parts, group_by_field=0): Sub Utt2Info object """ if group_by_field == 0: - key, idx1 = split_list(self.utt_info['key'], idx, num_parts) + key, idx1 = split_list(self.utt_info["key"], idx, num_parts) else: - key, idx1 = split_list_group_by_key(self.utt_info[group_by_field], idx, num_parts) - + key, idx1 = split_list_group_by_key( + self.utt_info[group_by_field], idx, num_parts + ) + utt_info = self.utt_info.iloc[idx1] return Utt2Info(utt_info) - - @classmethod def merge(cls, info_lists): """Merges several Utt2Info tables. - + Args: info_lists: List of Utt2Info - + Returns: Utt2Info object concatenation the info_lists. """ @@ -213,11 +192,9 @@ def merge(cls, info_lists): utt_info = pd.concat(df_list) return cls(utt_info) - - def filter(self, filter_key, keep=True): """Removes elements from Utt2Info object by key - + Args: filter_key: List with the keys of the elements to keep or remove. keep: If True, we keep the elements in filter_key; @@ -227,15 +204,13 @@ def filter(self, filter_key, keep=True): Utt2Info object. 
""" if not keep: - filter_key = np.setdiff1d(self.utt_info['key'], filter_key) + filter_key = np.setdiff1d(self.utt_info["key"], filter_key) utt_info = self.utt_info.loc[filter_key] return Utt2Info(utt_info) - - def filter_info(self, filter_key, field=1, keep=True): """Removes elements of Utt2Info by info value - + Args: filter_key: List with the file_path of the elements to keep or remove. field: Field number corresponding to the info to filter @@ -249,19 +224,17 @@ def filter_info(self, filter_key, field=1, keep=True): filter_key = np.setdiff1d(self.utt_info[field], filter_key) f, _ = ismember(filter_key, self.utt_info[field]) if not np.all(f): - for k in filter_key[f==False]: - logging.error('info %s not found in field %d' % (k,field)) - raise Exception('not all keys were found in field %d' % (field)) + for k in filter_key[f == False]: + logging.error("info %s not found in field %d" % (k, field)) + raise Exception("not all keys were found in field %d" % (field)) f, _ = ismember(self.utt_info[field], filter_key) utt_info = self.utt_info.iloc[f] return Utt2Info(utt_info) - - def filter_index(self, index, keep=True): """Removes elements of Utt2Info by index - + Args: filter_key: List with the index of the elements to keep or remove. keep: If True, we keep the elements in filter_key; @@ -271,15 +244,12 @@ def filter_index(self, index, keep=True): Utt2Info object. """ - if not keep : - index = np.setdiff1d(np.arange( - len(self.key), dtype=np.int64), index) + if not keep: + index = np.setdiff1d(np.arange(len(self.key), dtype=np.int64), index) utt_info = self.utt_info.iloc[index] return Utt2Info(utt_info) - - - + def shuffle(self, seed=1024, rng=None): """Shuffles the elements of the list. @@ -298,8 +268,6 @@ def shuffle(self, seed=1024, rng=None): self.key_to_index = None return index - - def __eq__(self, other): """Equal operator""" if self.utt_info.shape[0] == 0 and other.utt_info.shape[0] == 0: @@ -307,18 +275,12 @@ def __eq__(self, other): eq = self.utt_info.equals(other.utt_info) return eq - - def __ne__(self, other): """Non-equal operator""" return not self.__eq__(other) - - def __cmp__(self, other): """Comparison operator""" if self.__eq__(other): return 0 return 1 - - diff --git a/hyperion/utils/vad_utils.py b/hyperion/utils/vad_utils.py index df99ea76..2d68bc5c 100644 --- a/hyperion/utils/vad_utils.py +++ b/hyperion/utils/vad_utils.py @@ -6,19 +6,20 @@ from ..hyp_defs import float_cpu + def _assert_sorted(t): - delta = np.diff(t[:,0]) - assert np.all(delta >=0), 'time-stamps must be sorted' + delta = np.diff(t[:, 0]) + assert np.all(delta >= 0), "time-stamps must be sorted" def _assert_pos_dur(t): - delta = t[:,1] - t[:,0] - assert np.all(delta >=0), 'segments must have positve duration' - + delta = t[:, 1] - t[:, 0] + assert np.all(delta >= 0), "segments must have positve duration" + def merge_vad_timestamps(in_timestamps, tol=0.001): """Merges vad timestamps that are contiguous - + Args: in_timestamps: original time-stamps in start-time, end-time format tol: tolerance, segments separted less than tol will be merged @@ -34,16 +35,16 @@ def merge_vad_timestamps(in_timestamps, tol=0.001): _assert_pos_dur(in_timestamps) # assert segments are shorted by start time - delta = np.diff(in_timestamps[:,0]) - assert np.all(delta >=0), 'time-stamps must be sorted' + delta = np.diff(in_timestamps[:, 0]) + assert np.all(delta >= 0), "time-stamps must be sorted" out_timestamps = np.zeros_like(in_timestamps) - t_start = in_timestamps[0,0] - t_end = in_timestamps[0,1] + t_start = in_timestamps[0, 
0] + t_end = in_timestamps[0, 1] j = 0 for i in range(1, in_timestamps.shape[0]): - t_start_i = in_timestamps[i,0] - t_end_i = in_timestamps[i,1] + t_start_i = in_timestamps[i, 0] + t_end_i = in_timestamps[i, 1] if t_end >= t_start_i - tol: # we merge with previous if t_end_i > t_end: @@ -57,17 +58,18 @@ def merge_vad_timestamps(in_timestamps, tol=0.001): out_timestamps[j, 1] = t_end t_start = t_start_i t_end = t_end_i - j +=1 + j += 1 - #write final segment + # write final segment out_timestamps[j, 0] = t_start out_timestamps[j, 1] = t_end - out_timestamps = out_timestamps[:j+1] + out_timestamps = out_timestamps[: j + 1] return out_timestamps - - -def bin_vad_to_timestamps(vad, frame_length, frame_shift, snip_edges=False, merge_tol=0.001): + +def bin_vad_to_timestamps( + vad, frame_length, frame_shift, snip_edges=False, merge_tol=0.001 +): """Converts binary VAD to a list of start end time stamps Args: @@ -80,20 +82,27 @@ def bin_vad_to_timestamps(vad, frame_length, frame_shift, snip_edges=False, merg VAD time stamps refered to the begining of the file """ if snip_edges: - start=0 + start = 0 else: - start = - (frame_length - frame_shift)/2 - - start_timestamps = np.asarray([start + frame_shift*i for i in range(len(vad)) if vad[i]])[:, None] + start = -(frame_length - frame_shift) / 2 + + start_timestamps = np.asarray( + [start + frame_shift * i for i in range(len(vad)) if vad[i]] + )[:, None] end_timestamps = start_timestamps + frame_length - start_timestamps[start_timestamps<0] = 0 + start_timestamps[start_timestamps < 0] = 0 timestamps = np.concatenate((start_timestamps, end_timestamps), axis=1) return merge_vad_timestamps(timestamps, tol=merge_tol) - -def vad_timestamps_to_bin(in_timestamps, frame_length, frame_shift, snip_edges=False, - signal_length=None, max_frames=None): +def vad_timestamps_to_bin( + in_timestamps, + frame_length, + frame_shift, + snip_edges=False, + signal_length=None, + max_frames=None, +): """Converts VAD time-stamps to a binary vector Args: @@ -109,27 +118,36 @@ def vad_timestamps_to_bin(in_timestamps, frame_length, frame_shift, snip_edges=F _assert_pos_dur(in_timestamps) if signal_length is None: - signal_length = in_timestamps[-1,1] + signal_length = in_timestamps[-1, 1] else: - assert signal_length >= in_timestamps[-1,1] + assert signal_length >= in_timestamps[-1, 1] - frame_center = frame_length/2 + frame_center = frame_length / 2 if snip_edges: - num_frames = int(np.floor((signal_length - frame_length + frame_shift)/frame_shift)) + num_frames = int( + np.floor((signal_length - frame_length + frame_shift) / frame_shift) + ) pad = 0 else: - num_frames = int(np.round(signal_length/frame_shift)) - pad = - (frame_length - frame_shift)/2 + num_frames = int(np.round(signal_length / frame_shift)) + pad = -(frame_length - frame_shift) / 2 if max_frames is not None and num_frames < max_frames: num_frames = max_frames vad = np.zeros((num_frames,), dtype=np.bool) - frame_start = np.ceil((in_timestamps[:,0] - (pad + frame_center))/frame_shift).astype(dtype=np.int) - frame_end = np.floor((in_timestamps[:,1] - (pad + frame_center))/frame_shift).astype(dtype=np.int)+1 - frame_start[frame_start<0] = 0 - frame_end[frame_end>num_frames] = num_frames - for i,j in zip(frame_start, frame_end): + frame_start = np.ceil( + (in_timestamps[:, 0] - (pad + frame_center)) / frame_shift + ).astype(dtype=np.int) + frame_end = ( + np.floor((in_timestamps[:, 1] - (pad + frame_center)) / frame_shift).astype( + dtype=np.int + ) + + 1 + ) + frame_start[frame_start < 0] = 0 + 
frame_end[frame_end > num_frames] = num_frames + for i, j in zip(frame_start, frame_end): if j > i: vad[i:j] = True @@ -137,7 +155,6 @@ def vad_timestamps_to_bin(in_timestamps, frame_length, frame_shift, snip_edges=F vad = vad[:max_frames] return vad - def timestamps_wrt_vad_to_absolute_timestamps(in_timestamps, vad_timestamps): @@ -153,11 +170,11 @@ def timestamps_wrt_vad_to_absolute_timestamps(in_timestamps, vad_timestamps): Returns: Absolute VAD time-stamps """ - - bin_in = vad_timestamps_to_bin( - in_timestamps, frame_length=0.001, frame_shift=0.001) + + bin_in = vad_timestamps_to_bin(in_timestamps, frame_length=0.001, frame_shift=0.001) bin_vad = vad_timestamps_to_bin( - vad_timestamps, frame_length=0.001, frame_shift=0.001) + vad_timestamps, frame_length=0.001, frame_shift=0.001 + ) bin_out = np.zeros_like(bin_vad) j = 0 @@ -168,16 +185,16 @@ def timestamps_wrt_vad_to_absolute_timestamps(in_timestamps, vad_timestamps): j += 1 if j == max_j: break - + out_timestamps = bin_vad_to_timestamps( - bin_out, frame_length=0.001, frame_shift=0.001, merge_tol=0.001) + bin_out, frame_length=0.001, frame_shift=0.001, merge_tol=0.001 + ) return out_timestamps - - - + def timestamps_wrt_bin_vad_to_absolute_timestamps( - in_timestamps, vad, frame_length, frame_shift, snip_edges=False): + in_timestamps, vad, frame_length, frame_shift, snip_edges=False +): """Converts time stamps relative to a signal with silence removed to absoulute time stamps in the original signal @@ -192,25 +209,23 @@ def timestamps_wrt_bin_vad_to_absolute_timestamps( Returns: Absolute VAD time-stamps """ - vad_timestamps = bin_vad_to_timestamps( - vad, frame_length, frame_shift, snip_edges) - return timestamps_wrt_vad_to_absolute_timestamps( - in_timestamps, vad_timestamps) + vad_timestamps = bin_vad_to_timestamps(vad, frame_length, frame_shift, snip_edges) + return timestamps_wrt_vad_to_absolute_timestamps(in_timestamps, vad_timestamps) def intersect_segment_timestamps_with_vad(in_timestamps, vad_timestamps): - """ Intersects a list of segment timestamps with a VAD time-stamps - It returns only the segments that contain speech modifying + """Intersects a list of segment timestamps with a VAD time-stamps + It returns only the segments that contain speech modifying the start and end times to remove silence from the segments. Args: in_timestamps: time stamps of a list of segments refered to time 0. - vad_timestamps: vad timestamps + vad_timestamps: vad timestamps Returns: Boolean array indicating which input segments contain speech Array of output segments with silence removed - Array of indices, one index for each output segment indicating to which + Array of indices, one index for each output segment indicating to which input speech segment correspond to. The index correspond to input segments after removing input segments that only contain silence. 
""" @@ -227,7 +242,7 @@ def intersect_segment_timestamps_with_vad(in_timestamps, vad_timestamps): vad_start = vad_timestamps[:, 0] vad_end = vad_timestamps[:, 1] num_vad_segs = len(vad_start) - speech_idx = np.zeros((in_timestamps.shape[0],), dtype = np.bool) + speech_idx = np.zeros((in_timestamps.shape[0],), dtype=np.bool) out_timestamps = [] out_timestamps2speech_segs = [] count_speech = 0 @@ -239,13 +254,13 @@ def intersect_segment_timestamps_with_vad(in_timestamps, vad_timestamps): j += 1 if j == num_vad_segs: - break + break k = j while t_start < t_end: if k == num_vad_segs or vad_start[k] >= t_end or vad_end[k] <= t_start: break - #print('...', vad_start[k], vad_end[k], t_start, t_end) + # print('...', vad_start[k], vad_end[k], t_start, t_end) is_speech = True if vad_start[k] <= t_start: if vad_end[k] < t_end: @@ -262,9 +277,9 @@ def intersect_segment_timestamps_with_vad(in_timestamps, vad_timestamps): else: new_seg = [vad_start[k], t_end] t_start = t_end - + out_timestamps.append(new_seg) - #print('......', out_timestamps) + # print('......', out_timestamps) out_timestamps2speech_segs.append(count_speech) k += 1 @@ -274,5 +289,5 @@ def intersect_segment_timestamps_with_vad(in_timestamps, vad_timestamps): out_timestamps = np.asarray(out_timestamps) out_timestamps2speech_segs = np.asarray(out_timestamps2speech_segs, dtype=np.int) - + return speech_idx, out_timestamps, out_timestamps2speech_segs diff --git a/hyperion/vb_pdfs/core/exponential_family.py b/hyperion/vb_pdfs/core/exponential_family.py index 7ccc2b77..c3e59040 100644 --- a/hyperion/vb_pdfs/core/exponential_family.py +++ b/hyperion/vb_pdfs/core/exponential_family.py @@ -8,54 +8,49 @@ from abc import ABCMeta, abstractmethod from .pdf import PDF + class ExpFamily(PDF): __metaclass__ = ABCMeta - + def __init__(self, eta=None, **kwargs): super(ExpFamily, self).__init__(**kwargs) self.eta = eta self.A = None - - def fit(self, x, sample_weight=None, - x_val=None, sample_weight_val=None, batch_size=None): + def fit( + self, x, sample_weight=None, x_val=None, sample_weight_val=None, batch_size=None + ): - N, u_x =self.Estep(x=x, sample_weight=sample_weight, - batch_size=batch_size) + N, u_x = self.Estep(x=x, sample_weight=sample_weight, batch_size=batch_size) self.Mstep(N, u_x) - elbo=self.elbo(x, N=N, u_x=u_x) - elbo = [elbo, elbo/N] - + elbo = self.elbo(x, N=N, u_x=u_x) + elbo = [elbo, elbo / N] + if x_val is not None: - N, u_x = self.Estep(x=x_val, sample_weight=sample_weight_val, - batch_size=batch_size) + N, u_x = self.Estep( + x=x_val, sample_weight=sample_weight_val, batch_size=batch_size + ) elbo_val = self.elbo(x_val, N=N, u_x=u_x) - elbo += [elbo_val, elbo_val/N] + elbo += [elbo_val, elbo_val / N] return elbo - def log_h(self, x): return 0 - def accum_logh(self, x, sample_weight=None): if sample_weight is None: return np.sum(self.logh(x)) return np.sum(sample_weight * self.logh(x)) - - + def compute_suff_stats(self, x): return x - - def accum_suff_stats(self, x, u_x=None, sample_weight=None, batch_size=None): if u_x is not None or batch_size is None: return self._accum_suff_stats_1batch(x, u_x, sample_weight) else: return self._accum_suff_stats_nbatches(x, sample_weight, batch_size) - def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): if u_x is None: u_x = self.compute_suff_stats(x) @@ -64,15 +59,14 @@ def _accum_suff_stats_1batch(self, x, u_x=None, sample_weight=None): else: u_x *= sample_weight[:, None] N = np.sum(sample_weight) - acc_u_x=np.sum(u_x, axis=0) + acc_u_x = np.sum(u_x, axis=0) return N, 
acc_u_x - def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): sw_i = None for i1 in range(0, x.shape[0], batch_size): - i2 = np.minimum(i1+batch_size, x.shape[0]) - x_i = x[i1:i2,:] + i2 = np.minimum(i1 + batch_size, x.shape[0]) + x_i = x[i1:i2, :] if sample_weight is not None: sw_i = sample_weight[i1:i2] N_i, u_x_i = self._accum_suff_stats_1batch(x_i, sample_weight=sw_i) @@ -84,77 +78,62 @@ def _accum_suff_stats_nbatches(self, x, sample_weight, batch_size): u_x += u_x_i return N, u_x - def add_suff_stats(self, N, u_x): - assert(len(N)==len(u_x)) + assert len(N) == len(u_x) acc_N = N[1] acc_u_x = u_x[1] - for i in range(1,len(N)): + for i in range(1, len(N)): acc_N += N acc_u_x += u[i] return acc_N, acc_u_x - - def Estep(self, x, u_x=None, sample_weight=None, batch_size=None): return self.accum_suff_stats(x, u_x, sample_weight, batch_size) - @abstractmethod def Mstep(self, stats): pass - def elbo(self, x, u_x=None, N=1, logh=None, sample_weight=None, batch_size=None): if u_x is None: - N, u_x = self.accum_suff_stats(x, sample_weight=sample_weight, - batch_size=batch_size) + N, u_x = self.accum_suff_stats( + x, sample_weight=sample_weight, batch_size=batch_size + ) if logh is None: logh = self.accum_logh(x, sample_weight=sample_weight) - return logh + np.inner(u_x, self.eta) - N*self.A + return logh + np.inner(u_x, self.eta) - N * self.A - - def eval_llk(self, x, u_x=None, mode='nat'): - if mode == 'nat': + def eval_llk(self, x, u_x=None, mode="nat"): + if mode == "nat": return self.eval_llk_nat(x, u_x) else: return self.eval_llk_std(x) - - def eval_llk_nat(self, x, u_x = None): + def eval_llk_nat(self, x, u_x=None): if u_x is None: u_x = self.compute_suff_stats(x) return self.logh(x) + np.inner(u_x, self.eta) - self.A - - @staticmethod def compute_A_nat(eta): raise NotImplementedError() - @staticmethod def compute_A_std(params): raise NotImplementedError() - @staticmethod def compute_eta(param): raise NotImplementedError() - @staticmethod def compute_std(eta): raise NotImplementedError() - @abstractmethod def _compute_nat_params(self): pass - @abstractmethod def _compute_std_params(self): pass - - diff --git a/hyperion/vb_pdfs/core/pdf.py b/hyperion/vb_pdfs/core/pdf.py index 822889bd..012ff96c 100644 --- a/hyperion/vb_pdfs/core/pdf.py +++ b/hyperion/vb_pdfs/core/pdf.py @@ -8,29 +8,25 @@ from abc import ABCMeta, abstractmethod from ...hyp_model import HypModel + class PDF(HypModel): __metaclass__ = ABCMeta def __init__(self, **kwargs): super(PDF, self).__init__(**kwargs) - # def get_config(self): # config = {'x_dim': self.x_dim } # base_config = super(PDF, self).get_config() # return dict(list(base_config.items()) + list(config.items())) - @abstractmethod def log_prob(self, x): pass - def log_cdf(self, x): raise NotImplementedError - @abstractmethod def sample(self, num_samples): pass - diff --git a/tests/hyperion/feats/test_energy_vad.py b/tests/hyperion/feats/test_energy_vad.py index 713cb637..c4155b37 100644 --- a/tests/hyperion/feats/test_energy_vad.py +++ b/tests/hyperion/feats/test_energy_vad.py @@ -10,17 +10,18 @@ from hyperion.hyp_defs import float_cpu from hyperion.feats.energy_vad import EnergyVAD -fs=16000 +fs = 16000 + def generate_signal(): - rng = np.random.RandomState(seed = 1024) - s = (2**3)*rng.randn(fs*10).astype(float_cpu(), copy=False) + rng = np.random.RandomState(seed=1024) + s = (2 ** 3) * rng.randn(fs * 10).astype(float_cpu(), copy=False) vad = np.zeros((len(s),), dtype=bool) - vad[2*fs:8*fs] = True - s += 
(2**12)*vad.astype(dtype=float_cpu())*np.sign(s) + vad[2 * fs : 8 * fs] = True + s += (2 ** 12) * vad.astype(dtype=float_cpu()) * np.sign(s) vad = vad[::160] - #s = rng.randn(fs*10).astype(float_cpu(), copy=False) + # s = rng.randn(fs*10).astype(float_cpu(), copy=False) return s, vad @@ -30,7 +31,6 @@ def generate_signal(): def test_vad(): e_vad = EnergyVAD() vad_est = e_vad.compute(s) - print(np.max(s[2*fs:3*fs]), np.min(s[2*fs:3*fs])) + print(np.max(s[2 * fs : 3 * fs]), np.min(s[2 * fs : 3 * fs])) - assert np.mean(vad[:len(vad_est)]==vad_est) > 0.9 - + assert np.mean(vad[: len(vad_est)] == vad_est) > 0.9 diff --git a/tests/hyperion/feats/test_feature_normalization.py b/tests/hyperion/feats/test_feature_normalization.py index dae32ec4..eaae4225 100644 --- a/tests/hyperion/feats/test_feature_normalization.py +++ b/tests/hyperion/feats/test_feature_normalization.py @@ -14,12 +14,13 @@ def generate_features(): - rng = np.random.RandomState(seed = 1024) - x = rng.randn(60*100,2).astype(float_cpu(), copy=False) - x *= rng.rand(60*100,1) - + rng = np.random.RandomState(seed=1024) + x = rng.randn(60 * 100, 2).astype(float_cpu(), copy=False) + x *= rng.rand(60 * 100, 1) + return x + x = generate_features() @@ -38,8 +39,9 @@ def test_mvn_global(): def test_stmvn(): - mvn = MeanVarianceNorm(norm_mean=True, norm_var=False, - left_context=150, right_context=50) + mvn = MeanVarianceNorm( + norm_mean=True, norm_var=False, left_context=150, right_context=50 + ) x_norm = mvn.normalize(x) x_ref = mvn.normalize_slow(x) # idx=np.argmax(np.abs(x_norm-x_ref)) @@ -52,35 +54,37 @@ def test_stmvn(): # print(x_ref[-10:]) assert_allclose(x_norm, x_ref, atol=1e-4) - mvn = MeanVarianceNorm(norm_mean=True, norm_var=True, - left_context=150, right_context=50) + mvn = MeanVarianceNorm( + norm_mean=True, norm_var=True, left_context=150, right_context=50 + ) x_norm = mvn.normalize(x) x_ref = mvn.normalize_slow(x) assert_allclose(x_norm, x_ref, atol=1e-4) - def test_mvn_cum_forward(): - mvn = MeanVarianceNorm(norm_mean=True, norm_var=False, - left_context=None, right_context=0) + mvn = MeanVarianceNorm( + norm_mean=True, norm_var=False, left_context=None, right_context=0 + ) x_norm = mvn.normalize(x) x_ref = mvn.normalize_slow(x) assert_allclose(x_norm, x_ref, atol=1e-4) - mvn = MeanVarianceNorm(norm_mean=True, norm_var=True, - left_context=None, right_context=0) + mvn = MeanVarianceNorm( + norm_mean=True, norm_var=True, left_context=None, right_context=0 + ) x_norm = mvn.normalize(x) x_ref = mvn.normalize_slow(x) assert_allclose(x_norm, x_ref, atol=1e-4) - def test_mvn_cum_backward(): - mvn = MeanVarianceNorm(norm_mean=True, norm_var=False, - left_context=0, right_context=None) + mvn = MeanVarianceNorm( + norm_mean=True, norm_var=False, left_context=0, right_context=None + ) x_norm = mvn.normalize(x) x_ref = mvn.normalize_slow(x) # idx=np.argmax(np.abs(x_norm-x_ref)) @@ -93,11 +97,9 @@ def test_mvn_cum_backward(): # print(x_ref[-10:]) assert_allclose(x_norm, x_ref, atol=1e-4) - mvn = MeanVarianceNorm(norm_mean=True, norm_var=True, - left_context=0, right_context=None) + mvn = MeanVarianceNorm( + norm_mean=True, norm_var=True, left_context=0, right_context=None + ) x_norm = mvn.normalize(x) x_ref = mvn.normalize_slow(x) assert_allclose(x_norm, x_ref, atol=1e-4) - - - diff --git a/tests/hyperion/feats/test_frame_selector.py b/tests/hyperion/feats/test_frame_selector.py index c5b4527f..5513a657 100644 --- a/tests/hyperion/feats/test_frame_selector.py +++ b/tests/hyperion/feats/test_frame_selector.py @@ -14,21 +14,21 @@ def 
generate_features(): - rng = np.random.RandomState(seed = 1024) - x = rng.randn(10,2).astype(float_cpu(), copy=False) - vad = np.zeros((10,), dtype='bool') - vad[4:8]=1 - return x,vad + rng = np.random.RandomState(seed=1024) + x = rng.randn(10, 2).astype(float_cpu(), copy=False) + vad = np.zeros((10,), dtype="bool") + vad[4:8] = 1 + return x, vad -x,vad = generate_features() +x, vad = generate_features() def test_select(): fs = FrameSelector(tol_num_frames=3) - y = fs.select(x,vad) + y = fs.select(x, vad) assert_allclose(x[4:8], y) @@ -36,9 +36,8 @@ def test_select_missmatch_num_frames(): fs = FrameSelector(tol_num_frames=3) - y = fs.select(x[:8],vad) + y = fs.select(x[:8], vad) assert_allclose(x[4:8], y) - y = fs.select(x,vad[:8]) + y = fs.select(x, vad[:8]) assert_allclose(x[4:8], y) - diff --git a/tests/hyperion/feats/test_mfcc.py b/tests/hyperion/feats/test_mfcc.py index eff68c99..bb9ae386 100644 --- a/tests/hyperion/feats/test_mfcc.py +++ b/tests/hyperion/feats/test_mfcc.py @@ -10,49 +10,54 @@ from hyperion.hyp_defs import float_cpu from hyperion.feats.mfcc import MFCC -fs=16000 -window_type = 'povey' +fs = 16000 +window_type = "povey" + def generate_signal(): - rng = np.random.RandomState(seed = 1024) - s = (2**10)*rng.randn(fs*10).astype(float_cpu(), copy=False) - #s = rng.randn(fs*10).astype(float_cpu(), copy=False) + rng = np.random.RandomState(seed=1024) + s = (2 ** 10) * rng.randn(fs * 10).astype(float_cpu(), copy=False) + # s = rng.randn(fs*10).astype(float_cpu(), copy=False) return s + s = generate_signal() + def test_mfcc(): mfcc = MFCC(window_type=window_type) P = mfcc.compute(s) - + def test_mfcc_return_all(): mfcc = MFCC(window_type=window_type) - P, X, F, B = mfcc.compute(s, return_fft=True, return_fft_mag=True, return_logfb=True) + P, X, F, B = mfcc.compute( + s, return_fft=True, return_fft_mag=True, return_logfb=True + ) def test_mfcc_etsi(): - mfcc = MFCC(window_type=window_type, fb_type='mel_etsi') + mfcc = MFCC(window_type=window_type, fb_type="mel_etsi") P = mfcc.compute(s) def test_mfcc_linear(): - mfcc = MFCC(window_type=window_type, fb_type='linear') + mfcc = MFCC(window_type=window_type, fb_type="linear") P = mfcc.compute(s) def test_mfcc_from_fft(): - + mfcc = MFCC(window_type=window_type) P = mfcc.compute(s) - mfcc_1 = MFCC(window_type=window_type, output_step='fft') - mfcc_2 = MFCC(window_type=window_type, input_step='fft') + mfcc_1 = MFCC(window_type=window_type, output_step="fft") + mfcc_2 = MFCC(window_type=window_type, input_step="fft") X = mfcc_1.compute(s) P2 = mfcc_2.compute(X) @@ -61,12 +66,12 @@ def test_mfcc_from_fft(): def test_mfcc_from_fft_mag(): - + mfcc = MFCC(window_type=window_type) P = mfcc.compute(s) - mfcc_1 = MFCC(window_type=window_type, output_step='fft_mag') - mfcc_2 = MFCC(window_type=window_type, input_step='fft_mag') + mfcc_1 = MFCC(window_type=window_type, output_step="fft_mag") + mfcc_2 = MFCC(window_type=window_type, input_step="fft_mag") F = mfcc_1.compute(s) P2 = mfcc_2.compute(F) @@ -75,18 +80,14 @@ def test_mfcc_from_fft_mag(): def test_mfcc_from_logfb(): - + mfcc = MFCC(window_type=window_type) P = mfcc.compute(s) - mfcc_1 = MFCC(window_type=window_type, output_step='logfb') - mfcc_2 = MFCC(window_type=window_type, input_step='logfb') + mfcc_1 = MFCC(window_type=window_type, output_step="logfb") + mfcc_2 = MFCC(window_type=window_type, input_step="logfb") B = mfcc_1.compute(s) P2 = mfcc_2.compute(B) assert_allclose(P, P2, rtol=1e-5) - - - - diff --git a/tests/hyperion/feats/test_stft.py b/tests/hyperion/feats/test_stft.py 
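The test_stft.py changes below exercise analysis/synthesis round trips; the following sketch illustrates the pattern under the same settings as test_stft_hanning_half, with two seconds of Gaussian noise as an assumed stand-in for the test fixture.

import numpy as np
from hyperion.hyp_defs import float_cpu
from hyperion.feats.stft import stft, istft
from hyperion.feats.feature_windows import FeatureWindowFactory as FWF

# 50%-overlap Hanning analysis/synthesis; edges are trimmed before comparing,
# matching the 10-sample margin used in the tests below.
s = np.random.randn(2 * 16000).astype(float_cpu(), copy=False)
w = FWF.create("hanning", 512)
X = stft(s, frame_length=512, frame_shift=256, fft_length=512, window=w)
s_hat = np.real(istft(X, frame_length=512, frame_shift=256, window=w))
np.testing.assert_allclose(s[10 : s_hat.shape[0] - 10], s_hat[10:-10], rtol=1e-3, atol=1e-1)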
index 49db82e0..b8af5d84 100644 --- a/tests/hyperion/feats/test_stft.py +++ b/tests/hyperion/feats/test_stft.py @@ -11,66 +11,63 @@ from hyperion.feats.stft import * from hyperion.feats.feature_windows import FeatureWindowFactory as FWF -margin=10 +margin = 10 + def generate_signal(): - fs=16000 - rng = np.random.RandomState(seed = 1024) - s = (2**10)*rng.randn(fs*10).astype(float_cpu(), copy=False) + fs = 16000 + rng = np.random.RandomState(seed=1024) + s = (2 ** 10) * rng.randn(fs * 10).astype(float_cpu(), copy=False) return s + s = generate_signal() def test_stft_hanning_half(): - w = FWF.create('hanning', 512) - + w = FWF.create("hanning", 512) + X = stft(s, frame_length=512, frame_shift=256, fft_length=512, window=w) shat = np.real(istft(X, frame_length=512, frame_shift=256, window=w)) - s_ref = s[margin:shat.shape[0]-margin] + s_ref = s[margin : shat.shape[0] - margin] shat = shat[margin:-margin] assert_allclose(s_ref, shat, rtol=1e-3, atol=1e-1) def test_strft_hanning_half(): - w = FWF.create('hanning', 512) - + w = FWF.create("hanning", 512) + X = strft(s, frame_length=512, frame_shift=256, fft_length=512, window=w) shat = istrft(X, frame_length=512, frame_shift=256, window=w) - s_ref = s[margin:shat.shape[0]-margin] + s_ref = s[margin : shat.shape[0] - margin] shat = shat[margin:-margin] assert_allclose(s_ref, shat, rtol=1e-3, atol=1e-1) def test_stft_povey_10hz(): - w = FWF.create('povey', 400) - + w = FWF.create("povey", 400) + X = stft(s, frame_length=400, frame_shift=160, fft_length=512, window=w) shat = np.real(istft(X, frame_length=400, frame_shift=160, window=w)) - s_ref = s[margin:shat.shape[0]-margin] + s_ref = s[margin : shat.shape[0] - margin] shat = shat[margin:-margin] assert_allclose(s_ref, shat, rtol=1e-4, atol=1e-2) - def test_strft_povey_10hz(): - w = FWF.create('povey', 400) - + w = FWF.create("povey", 400) + X = strft(s, frame_length=400, frame_shift=160, fft_length=512, window=w) shat = istrft(X, frame_length=400, frame_shift=160, window=w) - s_ref = s[margin:shat.shape[0]-margin] + s_ref = s[margin : shat.shape[0] - margin] shat = shat[margin:-margin] assert_allclose(s_ref, shat, rtol=1e-4, atol=1e-2) - - - - diff --git a/tests/hyperion/helpers/test_vector_class_reader.py b/tests/hyperion/helpers/test_vector_class_reader.py index 85c9b64e..5dc386d5 100644 --- a/tests/hyperion/helpers/test_vector_class_reader.py +++ b/tests/hyperion/helpers/test_vector_class_reader.py @@ -12,91 +12,97 @@ from hyperion.io import H5DataWriter from hyperion.helpers import VectorClassReader -output_dir = './tests/data_out/helpers' +output_dir = "./tests/data_out/helpers" if not os.path.exists(output_dir): os.makedirs(output_dir) def create_u2c(): key = [str(k) for k in range(10)] - classes = [ 'c1' ] + ['c3']*6 + [ 'c2' ]*3 + classes = ["c1"] + ["c3"] * 6 + ["c2"] * 3 u2c = Utt2Info.create(key, classes) return u2c def test__filter_by_spc_min_spc(): u2c_in = create_u2c() - + u2c_out = VectorClassReader._filter_by_spc(u2c_in, min_spc=2) u2c_gt = Utt2Info.create(u2c_in.key[1:], u2c_in.info[1:]) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt u2c_out = VectorClassReader._filter_by_spc(u2c_in, min_spc=4) u2c_gt = Utt2Info.create(u2c_in.key[1:7], u2c_in.info[1:7]) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt u2c_out = VectorClassReader._filter_by_spc(u2c_in, min_spc=7) u2c_gt = Utt2Info.create([], []) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt + - def test__filter_by_spc_max_spc(): - + u2c_in = create_u2c() - u2c_out = VectorClassReader._filter_by_spc(u2c_in, 
max_spc=4, - spc_pruning_mode='last') + u2c_out = VectorClassReader._filter_by_spc( + u2c_in, max_spc=4, spc_pruning_mode="last" + ) f = np.ones_like(u2c_in.key, dtype=bool) f[5:7] = False u2c_gt = Utt2Info.create(u2c_in.key[f], u2c_in.info[f]) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt - u2c_out = VectorClassReader._filter_by_spc(u2c_in, max_spc=4, - spc_pruning_mode='first') + u2c_out = VectorClassReader._filter_by_spc( + u2c_in, max_spc=4, spc_pruning_mode="first" + ) f = np.ones_like(u2c_in.key, dtype=bool) f[1:3] = False u2c_gt = Utt2Info.create(u2c_in.key[f], u2c_in.info[f]) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt - rng = np.random.RandomState(1024) - u2c_out = VectorClassReader._filter_by_spc(u2c_in, max_spc=4, - spc_pruning_mode='random', rng=rng) + rng = np.random.RandomState(1024) + u2c_out = VectorClassReader._filter_by_spc( + u2c_in, max_spc=4, spc_pruning_mode="random", rng=rng + ) f = np.ones_like(u2c_in.key, dtype=bool) f[3] = False f[6] = False u2c_gt = Utt2Info.create(u2c_in.key[f], u2c_in.info[f]) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt def test__filter_by_spc_min_max_spc(): - + u2c_in = create_u2c() - u2c_out = VectorClassReader._filter_by_spc(u2c_in, min_spc=2, max_spc=4, - spc_pruning_mode='last') + u2c_out = VectorClassReader._filter_by_spc( + u2c_in, min_spc=2, max_spc=4, spc_pruning_mode="last" + ) f = np.ones_like(u2c_in.key, dtype=bool) f[0] = False f[5:7] = False u2c_gt = Utt2Info.create(u2c_in.key[f], u2c_in.info[f]) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt - u2c_out = VectorClassReader._filter_by_spc(u2c_in, min_spc=2, max_spc=4, - spc_pruning_mode='first') + u2c_out = VectorClassReader._filter_by_spc( + u2c_in, min_spc=2, max_spc=4, spc_pruning_mode="first" + ) f = np.ones_like(u2c_in.key, dtype=bool) - f[:3]=False + f[:3] = False u2c_gt = Utt2Info.create(u2c_in.key[f], u2c_in.info[f]) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt rng = np.random.RandomState(1024) - u2c_out=VectorClassReader._filter_by_spc(u2c_in, min_spc=2, max_spc=4, - spc_pruning_mode='random', rng=rng) + u2c_out = VectorClassReader._filter_by_spc( + u2c_in, min_spc=2, max_spc=4, spc_pruning_mode="random", rng=rng + ) f = np.ones_like(u2c_in.key, dtype=bool) f[0] = False f[3] = False f[6] = False u2c_gt = Utt2Info.create(u2c_in.key[f], u2c_in.info[f]) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt def test__split_classes_sequential_nonoverlap(): @@ -104,102 +110,128 @@ def test__split_classes_sequential_nonoverlap(): u2c_in = create_u2c() u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=3, mode='sequential') + u2c_in, min_spc=1, max_spc=3, mode="sequential" + ) u2c_gt = Utt2Info.create( - ['0','7','8','9','1','2','3','4','5','6'], - ['0','1','1','1','2','2','2','3','3','3']) - assert u2c_out==u2c_gt - + ["0", "7", "8", "9", "1", "2", "3", "4", "5", "6"], + ["0", "1", "1", "1", "2", "2", "2", "3", "3", "3"], + ) + assert u2c_out == u2c_gt + u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=2, mode='sequential') + u2c_in, min_spc=1, max_spc=2, mode="sequential" + ) u2c_gt = Utt2Info.create( - ['0','7','8','9','1','2','3','4','5','6'], - ['0','1','1','2','3','3','4','4','5','5']) - assert u2c_out==u2c_gt + ["0", "7", "8", "9", "1", "2", "3", "4", "5", "6"], + ["0", "1", "1", "2", "3", "3", "4", "4", "5", "5"], + ) + assert u2c_out == u2c_gt u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=4, mode='sequential') + u2c_in, min_spc=1, max_spc=4, mode="sequential" + ) u2c_gt 
= Utt2Info.create( - ['0','7','8','9','1','2','3','4','5','6'], - ['0','1','1','1','2','2','2','2','3','3']) - assert u2c_out==u2c_gt - + ["0", "7", "8", "9", "1", "2", "3", "4", "5", "6"], + ["0", "1", "1", "1", "2", "2", "2", "2", "3", "3"], + ) + assert u2c_out == u2c_gt + u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=2, max_spc=3, mode='sequential') + u2c_in, min_spc=2, max_spc=3, mode="sequential" + ) u2c_gt = Utt2Info.create( - ['7','8','9','1','2','3','4','5','6'], - ['1','1','1','2','2','2','3','3','3']) - assert u2c_out==u2c_gt + ["7", "8", "9", "1", "2", "3", "4", "5", "6"], + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ) + assert u2c_out == u2c_gt u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=2, max_spc=2, mode='sequential') + u2c_in, min_spc=2, max_spc=2, mode="sequential" + ) u2c_gt = Utt2Info.create( - ['7','8','1','2','3','4','5','6'], - ['1','1','3','3','4','4','5','5']) - assert u2c_out==u2c_gt + ["7", "8", "1", "2", "3", "4", "5", "6"], + ["1", "1", "3", "3", "4", "4", "5", "5"], + ) + assert u2c_out == u2c_gt u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=4, max_spc=4, mode='sequential') - u2c_gt = Utt2Info.create( - ['1','2','3','4'], ['2','2','2','2']) - assert u2c_out==u2c_gt + u2c_in, min_spc=4, max_spc=4, mode="sequential" + ) + u2c_gt = Utt2Info.create(["1", "2", "3", "4"], ["2", "2", "2", "2"]) + assert u2c_out == u2c_gt def test__split_classes_random_nonoverlap(): - + u2c_in = create_u2c() rng = np.random.RandomState(1024) u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=3, mode='random', rng=rng) + u2c_in, min_spc=1, max_spc=3, mode="random", rng=rng + ) print(u2c_out.key) print(u2c_out.info) - u2c_gt = Utt2Info.create(['0','1','1','1','2','2','2','3','3','3'], - ['0','7','8','9','3','6','1','3','4','5']) - assert u2c_out==u2c_gt + u2c_gt = Utt2Info.create( + ["0", "1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["0", "7", "8", "9", "3", "6", "1", "3", "4", "5"], + ) + assert u2c_out == u2c_gt rng = np.random.RandomState(1024) u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=2, mode='random', rng=rng) + u2c_in, min_spc=1, max_spc=2, mode="random", rng=rng + ) print(u2c_out.key) print(u2c_out.info) - u2c_gt = Utt2Info.create(['0','1','1','2','2','3','3','4','4','5','5'], - ['0','7','9','9','8','3','4','5','2','3','1']) - assert u2c_out==u2c_gt + u2c_gt = Utt2Info.create( + ["0", "1", "1", "2", "2", "3", "3", "4", "4", "5", "5"], + ["0", "7", "9", "9", "8", "3", "4", "5", "2", "3", "1"], + ) + assert u2c_out == u2c_gt rng = np.random.RandomState(1024) u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=4, mode='random', rng=rng) + u2c_in, min_spc=1, max_spc=4, mode="random", rng=rng + ) print(u2c_out.key) print(u2c_out.info) - u2c_gt = Utt2Info.create(['0','1','1','1','2','2','2','2','3','3','3','3'], - ['0','7','8','9','3','6','1','5','3','4','5','1']) - assert u2c_out==u2c_gt - + u2c_gt = Utt2Info.create( + ["0", "1", "1", "1", "2", "2", "2", "2", "3", "3", "3", "3"], + ["0", "7", "8", "9", "3", "6", "1", "5", "3", "4", "5", "1"], + ) + assert u2c_out == u2c_gt rng = np.random.RandomState(1024) u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=2, max_spc=3, mode='random', rng=rng) + u2c_in, min_spc=2, max_spc=3, mode="random", rng=rng + ) print(u2c_out.key) print(u2c_out.info) - u2c_gt = Utt2Info.create(['1','1','1','2','2','2','3','3','3'], - ['7','8','9','3','6','1','3','4','5']) - assert u2c_out==u2c_gt + u2c_gt = 
Utt2Info.create( + ["1", "1", "1", "2", "2", "2", "3", "3", "3"], + ["7", "8", "9", "3", "6", "1", "3", "4", "5"], + ) + assert u2c_out == u2c_gt rng = np.random.RandomState(1024) u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=2, max_spc=2, mode='random', rng=rng) - u2c_gt = Utt2Info.create(['1','1','2','2','3','3','4','4','5','5'], - ['7','9','9','8','3','4','5','2','3','1']) - assert u2c_out==u2c_gt + u2c_in, min_spc=2, max_spc=2, mode="random", rng=rng + ) + u2c_gt = Utt2Info.create( + ["1", "1", "2", "2", "3", "3", "4", "4", "5", "5"], + ["7", "9", "9", "8", "3", "4", "5", "2", "3", "1"], + ) + assert u2c_out == u2c_gt rng = np.random.RandomState(1024) u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=4, max_spc=4, mode='random', rng=rng) - u2c_gt = Utt2Info.create(['2','2','2','2','3','3','3','3'], - ['3','6','1','5','3','4','5','1']) - assert u2c_out==u2c_gt - + u2c_in, min_spc=4, max_spc=4, mode="random", rng=rng + ) + u2c_gt = Utt2Info.create( + ["2", "2", "2", "2", "3", "3", "3", "3"], + ["3", "6", "1", "5", "3", "4", "5", "1"], + ) + assert u2c_out == u2c_gt def test__split_classes_sequential_overlap(): @@ -207,102 +239,257 @@ def test__split_classes_sequential_overlap(): u2c_in = create_u2c() u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=3, overlap=1, mode='sequential') + u2c_in, min_spc=1, max_spc=3, overlap=1, mode="sequential" + ) u2c_gt = Utt2Info.create( - ['0','7','8','9','1','2','3','3','4','5','5','6'], - ['0','1','1','1','2','2','2','3','3','3','4','4']) - assert u2c_out==u2c_gt + ["0", "7", "8", "9", "1", "2", "3", "3", "4", "5", "5", "6"], + ["0", "1", "1", "1", "2", "2", "2", "3", "3", "3", "4", "4"], + ) + assert u2c_out == u2c_gt u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=3, overlap=2, mode='sequential') + u2c_in, min_spc=1, max_spc=3, overlap=2, mode="sequential" + ) u2c_gt = Utt2Info.create( - ['0','7','8','9','1','2','3','2','3','4','3','4','5','4','5','6'], - ['0','1','1','1','2','2','2','3','3','3','4','4','4','5','5','5']) - assert u2c_out==u2c_gt + [ + "0", + "7", + "8", + "9", + "1", + "2", + "3", + "2", + "3", + "4", + "3", + "4", + "5", + "4", + "5", + "6", + ], + [ + "0", + "1", + "1", + "1", + "2", + "2", + "2", + "3", + "3", + "3", + "4", + "4", + "4", + "5", + "5", + "5", + ], + ) + assert u2c_out == u2c_gt u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=2, overlap=1, mode='sequential') + u2c_in, min_spc=1, max_spc=2, overlap=1, mode="sequential" + ) u2c_gt = Utt2Info.create( - ['0','7','8','8','9','1','2','2','3','3','4','4','5','5','6'], - ['0','1','1','2','2','3','3','4','4','5','5','6','6','7','7']) - assert u2c_out==u2c_gt + ["0", "7", "8", "8", "9", "1", "2", "2", "3", "3", "4", "4", "5", "5", "6"], + ["0", "1", "1", "2", "2", "3", "3", "4", "4", "5", "5", "6", "6", "7", "7"], + ) + assert u2c_out == u2c_gt u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=4, overlap=3, mode='sequential') + u2c_in, min_spc=1, max_spc=4, overlap=3, mode="sequential" + ) u2c_gt = Utt2Info.create( - ['0','7','8','9','1','2','3','4','2','3','4','5','3','4','5','6'], - ['0','1','1','1','2','2','2','2','3','3','3','3','4','4','4','4']) - assert u2c_out==u2c_gt + [ + "0", + "7", + "8", + "9", + "1", + "2", + "3", + "4", + "2", + "3", + "4", + "5", + "3", + "4", + "5", + "6", + ], + [ + "0", + "1", + "1", + "1", + "2", + "2", + "2", + "2", + "3", + "3", + "3", + "3", + "4", + "4", + "4", + "4", + ], + ) + assert u2c_out == u2c_gt - def 
test__split_classes_random_nonoverlap(): - + u2c_in = create_u2c() rng = np.random.RandomState(1024) u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=3, overlap=2, mode='random', rng=rng) + u2c_in, min_spc=1, max_spc=3, overlap=2, mode="random", rng=rng + ) u2c_gt = Utt2Info.create( - ['0','7','8','9','3','6','1','3','4','5','5','2','6','3','1','5'], - ['0','1','1','1','2','2','2','3','3','3','4','4','4','5','5','5']) - - assert u2c_out==u2c_gt + [ + "0", + "7", + "8", + "9", + "3", + "6", + "1", + "3", + "4", + "5", + "5", + "2", + "6", + "3", + "1", + "5", + ], + [ + "0", + "1", + "1", + "1", + "2", + "2", + "2", + "3", + "3", + "3", + "4", + "4", + "4", + "5", + "5", + "5", + ], + ) + + assert u2c_out == u2c_gt rng = np.random.RandomState(1024) u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=2, overlap=1, mode='random', rng=rng) + u2c_in, min_spc=1, max_spc=2, overlap=1, mode="random", rng=rng + ) u2c_gt = Utt2Info.create( - ['0','7','9','9','8','3','4','5','2','3','1','3','5','3','2'], - ['0','1','1','2','2','3','3','4','4','5','5','6','6','7','7']) - - assert u2c_out==u2c_gt + ["0", "7", "9", "9", "8", "3", "4", "5", "2", "3", "1", "3", "5", "3", "2"], + ["0", "1", "1", "2", "2", "3", "3", "4", "4", "5", "5", "6", "6", "7", "7"], + ) + + assert u2c_out == u2c_gt rng = np.random.RandomState(1024) u2c_out = VectorClassReader._split_classes( - u2c_in, min_spc=1, max_spc=4, overlap=2, mode='random', rng=rng) + u2c_in, min_spc=1, max_spc=4, overlap=2, mode="random", rng=rng + ) u2c_gt = Utt2Info.create( - ['0','7','8','9','3','6','1','5','3','4','5','1'], - ['0','1','1','1','2','2','2','2','3','3','3','3']) + ["0", "7", "8", "9", "3", "6", "1", "5", "3", "4", "5", "1"], + ["0", "1", "1", "1", "2", "2", "2", "2", "3", "3", "3", "3"], + ) - assert u2c_out==u2c_gt + assert u2c_out == u2c_gt - def test_vector_class_reader(): - v_file = output_dir + '/vcr.h5' - key_file = output_dir + '/vcr.u2c' + v_file = output_dir + "/vcr.h5" + key_file = output_dir + "/vcr.u2c" u2c = create_u2c() - x = np.random.randn(len(u2c.key),2).astype('float32') - + x = np.random.randn(len(u2c.key), 2).astype("float32") + h = H5DataWriter(v_file) h.write(u2c.key, x) u2c.save(key_file) - vcr = VectorClassReader(v_file, key_file, vlist_sep=' ', - csplit_min_spc=1, csplit_max_spc=3, - csplit_overlap=2, csplit_mode='random', - vcr_seed=1024) - + vcr = VectorClassReader( + v_file, + key_file, + vlist_sep=" ", + csplit_min_spc=1, + csplit_max_spc=3, + csplit_overlap=2, + csplit_mode="random", + vcr_seed=1024, + ) + x_test, class_ids_test = vcr.read() print(x_test) print(class_ids_test) - u2c_gt = Utt2Info.create(['0','1','1','1','2','2','2','3','3','3','4','4','4','5','5','5'], - ['0','7','8','9','3','6','1','3','4','5','5','2','6','3','1','5']) - class_ids_gt = np.array([0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5], dtype='int') - indx_gt = np.array([0,7,8,9,3,6,1,3,4,5,5,2,6,3,1,5], dtype='int') + u2c_gt = Utt2Info.create( + [ + "0", + "1", + "1", + "1", + "2", + "2", + "2", + "3", + "3", + "3", + "4", + "4", + "4", + "5", + "5", + "5", + ], + [ + "0", + "7", + "8", + "9", + "3", + "6", + "1", + "3", + "4", + "5", + "5", + "2", + "6", + "3", + "1", + "5", + ], + ) + class_ids_gt = np.array( + [0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5], dtype="int" + ) + indx_gt = np.array([0, 7, 8, 9, 3, 6, 1, 3, 4, 5, 5, 2, 6, 3, 1, 5], dtype="int") x_gt = x[indx_gt, :] print(x_gt) print(class_ids_gt) assert_allclose(x_test, x_gt, rtol=1e-5) - assert np.all(class_ids_test==class_ids_gt) + assert 
np.all(class_ids_test == class_ids_gt) - -if __name__ == '__main__': - pytest.main([__file__]) - - +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/hyperion/io/test_ark_rw.py b/tests/hyperion/io/test_ark_rw.py index fe9f36dc..aefdca80 100644 --- a/tests/hyperion/io/test_ark_rw.py +++ b/tests/hyperion/io/test_ark_rw.py @@ -14,44 +14,51 @@ from hyperion.io.data_rw_factory import SequentialDataReaderFactory as SDRF from hyperion.io.data_rw_factory import RandomAccessDataReaderFactory as RDRF -input_prefix = './tests/data_in/ark/' -feat_scp_b = 'scp:./tests/data_in/ark/feat_b.scp' -feat_scp_t = 'scp:./tests/data_in/ark/feat_t.scp' -feat_scp_c = ['scp:./tests/data_in/ark/feat_c%d.scp' % i for i in range(1,8)] -feat_scp_uc = ['scp:./tests/data_in/ark/feat_uc%d.scp' % i for i in range(1,8)] -feat_ark_b = 'ark:./tests/data_in/ark/feat1_b.ark' -feat_ark_t = 'ark:./tests/data_in/ark/feat1_t.ark' -feat_ark_c = ['ark:./tests/data_in/ark/feat1_c%d.ark' % i for i in range(1,8)] -feat_ark_uc = ['ark:./tests/data_in/ark/feat1_uc%d.ark' % i for i in range(1,8)] - -feat_range_b = 'scp:./tests/data_in/ark/feat_range_b.scp' -feat_range_c = ['scp:./tests/data_in/ark/feat_range_c%d.scp' % i for i in range(1,8)] - -vec_scp_b = 'scp:./tests/data_in/ark/vec_b.scp' -vec_scp_t = 'scp:./tests/data_in/ark/vec_t.scp' -vec_ark_b = 'ark:./tests/data_in/ark/vec1_b.ark' -vec_ark_t = 'ark:./tests/data_in/ark/vec1_t.ark' - -feat_ark_bo = 'ark:./tests/data_out/ark/feat.ark' -feat_scp_bo = 'scp:./tests/data_out/ark/feat.scp' -feat_ark_to = 'ark,t:./tests/data_out/ark/feat.ark' -feat_scp_co = ['scp:./tests/data_out/ark/feat_c%d.scp' % i for i in range(1,8)] -feat_both_bo = 'ark,scp:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp' -feat_both_to = 'ark,scp,t:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp' -feat_both_bfo = 'ark,scp,f:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp' -feat_both_tfo = 'ark,scp,t,f:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp' -feat_both_co = ['ark,scp:./tests/data_out/ark/feat_%d.ark,./tests/data_out/ark/feat_c%d.scp' % (i,i) for i in range(1,8)] - -vec_ark_bo = 'ark:./tests/data_out/ark/feat.ark' -vec_scp_bo = 'scp:./tests/data_out/ark/feat.scp' -vec_ark_to = 'ark,t:./tests/data_out/ark/feat.ark' -vec_both_bo = 'ark,scp:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp' -vec_both_to = 'ark,scp,t:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp' +input_prefix = "./tests/data_in/ark/" +feat_scp_b = "scp:./tests/data_in/ark/feat_b.scp" +feat_scp_t = "scp:./tests/data_in/ark/feat_t.scp" +feat_scp_c = ["scp:./tests/data_in/ark/feat_c%d.scp" % i for i in range(1, 8)] +feat_scp_uc = ["scp:./tests/data_in/ark/feat_uc%d.scp" % i for i in range(1, 8)] +feat_ark_b = "ark:./tests/data_in/ark/feat1_b.ark" +feat_ark_t = "ark:./tests/data_in/ark/feat1_t.ark" +feat_ark_c = ["ark:./tests/data_in/ark/feat1_c%d.ark" % i for i in range(1, 8)] +feat_ark_uc = ["ark:./tests/data_in/ark/feat1_uc%d.ark" % i for i in range(1, 8)] + +feat_range_b = "scp:./tests/data_in/ark/feat_range_b.scp" +feat_range_c = ["scp:./tests/data_in/ark/feat_range_c%d.scp" % i for i in range(1, 8)] + +vec_scp_b = "scp:./tests/data_in/ark/vec_b.scp" +vec_scp_t = "scp:./tests/data_in/ark/vec_t.scp" +vec_ark_b = "ark:./tests/data_in/ark/vec1_b.ark" +vec_ark_t = "ark:./tests/data_in/ark/vec1_t.ark" + +feat_ark_bo = "ark:./tests/data_out/ark/feat.ark" +feat_scp_bo = "scp:./tests/data_out/ark/feat.scp" +feat_ark_to = "ark,t:./tests/data_out/ark/feat.ark" 
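The path constants in this block are Kaldi-style read/write specifiers, which the reader and writer factories below parse; as a minimal reading sketch reusing the fixture paths above (the gloss on the prefixes follows Kaldi conventions and the behaviour exercised by these tests):

from hyperion.io.data_rw_factory import SequentialDataReaderFactory as SDRF
from hyperion.io.data_rw_factory import RandomAccessDataReaderFactory as RDRF

# "scp:" points at an index file, "ark:" at the archive itself, "ark,t:" selects
# text format, and a leading "p," makes missing keys permissive instead of raising.
with SDRF.create("scp:./tests/data_in/ark/feat_b.scp", path_prefix="./tests/data_in/ark/") as r:
    keys, feats = r.read(0)  # 0 reads every matrix listed in the scp

r2 = RDRF.create("p,scp:./tests/data_in/ark/feat_b.scp", path_prefix="./tests/data_in/ark/")
feats2 = r2.read(keys[:2])  # random access by key; unknown keys yield empty arrays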
+feat_scp_co = ["scp:./tests/data_out/ark/feat_c%d.scp" % i for i in range(1, 8)] +feat_both_bo = "ark,scp:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp" +feat_both_to = "ark,scp,t:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp" +feat_both_bfo = "ark,scp,f:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp" +feat_both_tfo = ( + "ark,scp,t,f:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp" +) +feat_both_co = [ + "ark,scp:./tests/data_out/ark/feat_%d.ark,./tests/data_out/ark/feat_c%d.scp" + % (i, i) + for i in range(1, 8) +] + +vec_ark_bo = "ark:./tests/data_out/ark/feat.ark" +vec_scp_bo = "scp:./tests/data_out/ark/feat.scp" +vec_ark_to = "ark,t:./tests/data_out/ark/feat.ark" +vec_both_bo = "ark,scp:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp" +vec_both_to = "ark,scp,t:./tests/data_out/ark/feat.ark,./tests/data_out/ark/feat.scp" compression_methods = compression_methods[:7] # Uncompressed feature files + def test_read_seq_file_feat(): # ark binary @@ -60,7 +67,7 @@ def test_read_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0]) @@ -71,12 +78,12 @@ def test_read_seq_file_feat(): data2 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) @@ -101,11 +108,10 @@ def test_read_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_random_feat(): @@ -121,14 +127,14 @@ def test_read_random_feat(): r = RDRF.create(feat_scp_b, path_prefix=input_prefix) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) # text r = RDRF.create(feat_scp_t, path_prefix=input_prefix) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2, rtol=1e-5) @@ -142,26 +148,25 @@ def test_read_random_feat_permissive(): key1.append(key_i[0]) data1.append(data_i[0]) - key1.append('unk') - + key1.append("unk") + # binary - r = RDRF.create('p,'+feat_scp_b, path_prefix=input_prefix) + r = RDRF.create("p," + feat_scp_b, path_prefix=input_prefix) data2 = r.read(key1) - for d1,d2 in zip(data1, data2[:-1]): + for d1, d2 in zip(data1, data2[:-1]): assert_allclose(d1, d2) assert data2[-1].shape == (0,) # text - r = RDRF.create('p,'+feat_scp_t, path_prefix=input_prefix) + r = RDRF.create("p," + feat_scp_t, path_prefix=input_prefix) data2 = r.read(key1) - for d1,d2 in zip(data1, data2[:-1]): + for d1, d2 in zip(data1, data2[:-1]): assert_allclose(d1, d2, rtol=1e-5) assert data2[-1].shape == (0,) - def test_read_seq_scp_split_feat(): # scp binary @@ -176,18 +181,18 @@ def test_read_seq_scp_split_feat(): key2 = [] data2 = [] for i in range(4): - r = SDRF.create(feat_scp_b, path_prefix=input_prefix, - part_idx=i+1, num_parts=4) + r = SDRF.create( + feat_scp_b, path_prefix=input_prefix, part_idx=i + 1, num_parts=4 + ) key_i, data_i = r.read(0) key2 = key2 + key_i data2 = data2 + data_i - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, 
rtol=1e-4) - - + def test_write_feat(): r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -202,7 +207,7 @@ def test_write_feat(): w = DWF.create(feat_both_bo) w.write(key1, data1) w.close() - + r = SDRF.create(feat_scp_bo) key2 = [] data2 = [] @@ -211,16 +216,15 @@ def test_write_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2) - # text w = DWF.create(feat_both_to) w.write(key1, data1) w.close() - + r = SDRF.create(feat_scp_bo) key2 = [] data2 = [] @@ -229,16 +233,15 @@ def test_write_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 # i = np.isclose(d1,d2) == False # print(d1[i]) - # print(d2[i]) + # print(d2[i]) assert_allclose(d1, d2, rtol=1e-4) - def test_write_flush_feat(): r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -253,7 +256,7 @@ def test_write_flush_feat(): w = DWF.create(feat_both_bfo) w.write(key1, data1) w.close() - + r = SDRF.create(feat_scp_bo) key2 = [] data2 = [] @@ -262,16 +265,15 @@ def test_write_flush_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2) - # text w = DWF.create(feat_both_tfo) w.write(key1, data1) w.close() - + r = SDRF.create(feat_scp_bo) key2 = [] data2 = [] @@ -280,16 +282,15 @@ def test_write_flush_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 # i = np.isclose(d1,d2) == False # print(d1[i]) - # print(d2[i]) + # print(d2[i]) assert_allclose(d1, d2, rtol=1e-4) - def test_with_read_seq_file_feat(): # ark binary @@ -299,7 +300,7 @@ def test_with_read_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0]) @@ -310,17 +311,16 @@ def test_with_read_seq_file_feat(): with SDRF.create(feat_ark_b, path_prefix=input_prefix) as r: while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - def test_with_read_seq_scp_feat(): # scp binary @@ -342,11 +342,10 @@ def test_with_read_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_with_read_random_feat(): @@ -358,14 +357,13 @@ def test_with_read_random_feat(): key1.append(key_i[0]) data1.append(data_i[0]) - # binary with + # binary with with RDRF.create(feat_scp_b, path_prefix=input_prefix) as r: data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - - + def test_with_write_feat(): @@ -380,7 +378,7 @@ def test_with_write_feat(): # binary with with DWF.create(feat_both_bo) as w: w.write(key1, data1) - + r = SDRF.create(feat_scp_bo) key2 = [] data2 = [] @@ -389,12 +387,11 @@ def test_with_write_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in 
zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2) - def test_read_iterator_seq_file_feat(): # ark binary @@ -403,42 +400,39 @@ def test_read_iterator_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0]) - r = SDRF.create(feat_ark_b, path_prefix=input_prefix) key2 = [] data2 = [] for key_i, data_i in r: - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i) data2.append(data_i) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - # ark text r = SDRF.create(feat_ark_t, path_prefix=input_prefix) key2 = [] data2 = [] for key_i, data_i in r: - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i) data2.append(data_i) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - def test_read_iterator_seq_scp_feat(): # scp binary @@ -457,11 +451,10 @@ def test_read_iterator_seq_scp_feat(): key2.append(key_i) data2.append(data_i) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - # scp text r = SDRF.create(feat_scp_t, path_prefix=input_prefix) key2 = [] @@ -470,12 +463,11 @@ def test_read_iterator_seq_scp_feat(): key2.append(key_i) data2.append(data_i) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_reset_seq_file_feat(): # ark binary @@ -484,7 +476,7 @@ def test_reset_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0]) @@ -495,17 +487,16 @@ def test_reset_seq_file_feat(): data2 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - def test_reset_seq_scp_feat(): # scp binary @@ -526,12 +517,11 @@ def test_reset_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_shapes_seq_file_feat(): # ark binary @@ -540,28 +530,26 @@ def test_read_shapes_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape) - r = SDRF.create(feat_ark_b, path_prefix=input_prefix) key2 = [] data2 = [] while not r.eof(): key_i, data_i = r.read_shapes(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_shapes_seq_scp_feat(): # scp binary @@ -573,7 +561,6 @@ def test_read_shapes_seq_scp_feat(): key1.append(key_i[0]) data1.append(data_i[0].shape) - r = SDRF.create(feat_scp_b, path_prefix=input_prefix) key2 = [] data2 = [] @@ -582,11 +569,10 @@ 
def test_read_shapes_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_shapes_random_feat(): @@ -601,11 +587,10 @@ def test_read_shapes_random_feat(): r = RDRF.create(feat_scp_b, path_prefix=input_prefix) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_num_rows_seq_file_feat(): # ark binary @@ -614,7 +599,7 @@ def test_read_num_rows_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape[0]) @@ -624,17 +609,16 @@ def test_read_num_rows_seq_file_feat(): data2 = [] while not r.eof(): key_i, data_i = r.read_num_rows(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_num_rows_seq_scp_feat(): # scp binary @@ -646,7 +630,6 @@ def test_read_num_rows_seq_scp_feat(): key1.append(key_i[0]) data1.append(data_i[0].shape[0]) - r = SDRF.create(feat_scp_b, path_prefix=input_prefix) key2 = [] data2 = [] @@ -655,11 +638,10 @@ def test_read_num_rows_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_num_rows_random_feat(): @@ -674,11 +656,10 @@ def test_read_num_rows_random_feat(): r = RDRF.create(feat_scp_b, path_prefix=input_prefix) data2 = r.read_num_rows(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_dims_seq_file_feat(): # ark binary @@ -687,28 +668,26 @@ def test_read_dims_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape[1]) - r = SDRF.create(feat_ark_b, path_prefix=input_prefix) key2 = [] data2 = [] while not r.eof(): key_i, data_i = r.read_dims(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_dims_seq_scp_feat(): # scp binary @@ -720,7 +699,6 @@ def test_read_dims_seq_scp_feat(): key1.append(key_i[0]) data1.append(data_i[0].shape[1]) - r = SDRF.create(feat_scp_b, path_prefix=input_prefix) key2 = [] data2 = [] @@ -729,11 +707,10 @@ def test_read_dims_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_dims_random_feat(): @@ -748,11 +725,10 @@ def test_read_dims_random_feat(): r = RDRF.create(feat_scp_b, path_prefix=input_prefix) data2 = r.read_dims(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_range_seq_scp_feat(): r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -762,7 +738,7 @@ def test_read_range_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51]) + data1.append(data_i[0][i : i + 51]) i += 1 r = 
SDRF.create(feat_range_b, path_prefix=input_prefix) @@ -773,11 +749,10 @@ def test_read_range_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_range_random_feat(): @@ -788,18 +763,17 @@ def test_read_range_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51]) + data1.append(data_i[0][i : i + 51]) i += 1 - + # binary r = RDRF.create(feat_range_b, path_prefix=input_prefix) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_range_shapes_seq_scp_feat(): r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -809,17 +783,16 @@ def test_read_range_shapes_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51].shape) + data1.append(data_i[0][i : i + 51].shape) i += 1 r = SDRF.create(feat_range_b, path_prefix=input_prefix) key2, data2 = r.read_shapes(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_range_shapes_random_feat(): @@ -830,18 +803,17 @@ def test_read_range_shapes_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51].shape) + data1.append(data_i[0][i : i + 51].shape) i += 1 - + # binary r = RDRF.create(feat_range_b, path_prefix=input_prefix) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_range2_seq_file_feat(): # ark binary @@ -851,10 +823,10 @@ def test_read_range2_seq_file_feat(): i = 0 while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 r = SDRF.create(feat_ark_b, path_prefix=input_prefix) @@ -863,19 +835,18 @@ def test_read_range2_seq_file_feat(): i = 0 while not r.eof(): key_i, data_i = r.read(1, row_offset=i, num_rows=10) - if len(key_i)==0: + if len(key_i) == 0: break print(key_i[0]) key2.append(key_i[0]) data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - - + def test_read_range2_seq_scp_feat(): r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -885,7 +856,7 @@ def test_read_range2_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -898,11 +869,10 @@ def test_read_range2_seq_scp_feat(): data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_range2_random_feat(): @@ -913,19 +883,18 @@ def test_read_range2_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + # binary r = RDRF.create(feat_scp_b, path_prefix=input_prefix) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, 
data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_rangex2_seq_scp_feat(): r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -935,7 +904,7 @@ def test_read_rangex2_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][2*i:2*i+10]) + data1.append(data_i[0][2 * i : 2 * i + 10]) i += 1 r = SDRF.create(feat_range_b, path_prefix=input_prefix) @@ -948,11 +917,10 @@ def test_read_rangex2_seq_scp_feat(): data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_rangex2_random_feat(): @@ -963,19 +931,18 @@ def test_read_rangex2_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][2*i:2*i+10]) + data1.append(data_i[0][2 * i : 2 * i + 10]) i += 1 - + # binary r = RDRF.create(feat_range_b, path_prefix=input_prefix) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_squeeze_random_feat(): r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -985,9 +952,9 @@ def test_read_squeeze_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + # binary r = RDRF.create(feat_scp_b, path_prefix=input_prefix) row_offset = [i for i in range(len(key1))] @@ -995,7 +962,7 @@ def test_read_squeeze_random_feat(): assert isinstance(data2, np.ndarray) assert data2.ndim == 3 - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) @@ -1008,24 +975,22 @@ def test_read_squeeze_random_feat_permissive(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + # binary - key1.append('unk') - r = RDRF.create('p,'+feat_scp_b, path_prefix=input_prefix) + key1.append("unk") + r = RDRF.create("p," + feat_scp_b, path_prefix=input_prefix) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, squeeze=True, row_offset=row_offset, num_rows=10) assert isinstance(data2, np.ndarray) assert data2.ndim == 3 - for d1,d2 in zip(data1, data2[:-1]): + for d1, d2 in zip(data1, data2[:-1]): assert_allclose(d1, d2) assert_allclose(data2[-1], np.zeros(data2[0].shape)) - - def test_write_squeeze_feat(): r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -1042,7 +1007,7 @@ def test_write_squeeze_feat(): w = DWF.create(feat_both_bo) w.write(key1, data1s) w.close() - + r = SDRF.create(feat_scp_bo) key2 = [] data2 = [] @@ -1051,16 +1016,15 @@ def test_write_squeeze_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2) - # Compressed feature files -def test_read_compress_seq_file_feat(): +def test_read_compress_seq_file_feat(): for i, cm in enumerate(compression_methods): # ark uncompressed binary @@ -1087,10 +1051,15 @@ def test_read_compress_seq_file_feat(): # key2.append(key_i[0]) # data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' 
% cm)) + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_read_compress_seq_scp_feat(): @@ -1104,13 +1073,17 @@ def test_read_compress_seq_scp_feat(): r = SDRF.create(feat_scp_c[i], path_prefix=input_prefix) key2, data2 = r.read(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - def test_read_compress_random_feat(): for i, cm in enumerate(compression_methods): @@ -1122,10 +1095,14 @@ def test_read_compress_random_feat(): r = RDRF.create(feat_scp_c[i], path_prefix=input_prefix) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) - + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_write_compress_feat(): @@ -1135,7 +1112,7 @@ def test_write_compress_feat(): for i, cm in enumerate(compression_methods): # write compressed - print('') + print("") w = DWF.create(feat_both_co[i], compress=True, compression_method=cm) w.write(key1, data1) w.close() @@ -1148,22 +1125,21 @@ def test_write_compress_feat(): r = SDRF.create(feat_scp_co[i]) key2, data2 = r.read(0) - for d1,d1c,d2 in zip(data1, data1c, data2): - #idx = np.argmin(np.abs(d1)) - #atol = np.abs(d1.ravel()[idx]-d1c.ravel()[idx]) - #rtol = np.max(np.abs(np.abs(d1-d1c)-atol)/np.abs(d1)) - #f = np.isclose(d1, d2, rtol=rtol, atol=atol) == False - err11c = np.abs(d1-d1c) + np.abs(d1)*0.001 - err1c2 = np.abs(d1c-d2) - err12 = np.abs(d1-d2) - + for d1, d1c, d2 in zip(data1, data1c, data2): + # idx = np.argmin(np.abs(d1)) + # atol = np.abs(d1.ravel()[idx]-d1c.ravel()[idx]) + # rtol = np.max(np.abs(np.abs(d1-d1c)-atol)/np.abs(d1)) + # f = np.isclose(d1, d2, rtol=rtol, atol=atol) == False + err11c = np.abs(d1 - d1c) + np.abs(d1) * 0.001 + err1c2 = np.abs(d1c - d2) + err12 = np.abs(d1 - d2) + f = np.logical_and(err11c < err1c2, err11c < err12) - #print(atol, rtol) - for a,b,c in zip(d1[f], d1c[f], d2[f]): - print(a,b,c,a-b,b-c,a-c) - - assert not np.any(f), 'Write compression %s failed' % cm + # print(atol, rtol) + for a, b, c in zip(d1[f], d1c[f], d2[f]): + print(a, b, c, a - b, b - c, a - c) + assert not np.any(f), "Write compression %s failed" % cm def test_read_shapes_compress_seq_file_feat(): @@ -1174,29 +1150,27 @@ def test_read_shapes_compress_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape) - for i, cm in enumerate(compression_methods): r = SDRF.create(feat_ark_c[i], path_prefix=input_prefix) key2 = [] data2 = [] while not r.eof(): key_i, data_i = r.read_shapes(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): - assert k1 == k2, 'Wrong key for method %s' % cm - assert d1 == d2, 'Wrong shape for method %s' % cm + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): + assert k1 == k2, "Wrong key for method %s" % cm + assert d1 == d2, "Wrong shape for method %s" % cm - def test_read_shapes_compress_seq_scp_feat(): # scp binary @@ -1217,11 +1191,10 @@ def test_read_shapes_compress_seq_scp_feat(): 
key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): - assert k1 == k2, 'Wrong key for method %s' % cm - assert d1 == d2, 'Wrong shape for method %s' % cm + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): + assert k1 == k2, "Wrong key for method %s" % cm + assert d1 == d2, "Wrong shape for method %s" % cm - def test_read_shapes_compress_random_feat(): @@ -1237,9 +1210,8 @@ def test_read_shapes_compress_random_feat(): r = RDRF.create(feat_scp_c[i], path_prefix=input_prefix) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): - assert d1 == d2, 'Wrong shape for method %s' % cm - + for d1, d2 in zip(data1, data2): + assert d1 == d2, "Wrong shape for method %s" % cm def test_read_range_compress_seq_scp_feat(): @@ -1253,19 +1225,23 @@ def test_read_range_compress_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51]) + data1.append(data_i[0][i : i + 51]) i += 1 # scp compressed print(feat_range_c[i]) r = SDRF.create(feat_range_c[k], path_prefix=input_prefix) key2, data2 = r.read(0) - - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): - assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): + assert k1 == k2 + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_read_range_compress_random_feat(): @@ -1279,18 +1255,21 @@ def test_read_range_compress_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51]) + data1.append(data_i[0][i : i + 51]) i += 1 - + # scp compressed binary r = RDRF.create(feat_range_c[k], path_prefix=input_prefix) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) - - + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_read_range_shapes_compress_seq_scp_feat(): @@ -1302,18 +1281,17 @@ def test_read_range_shapes_compress_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51].shape) + data1.append(data_i[0][i : i + 51].shape) i += 1 for k, cm in enumerate(compression_methods): r = SDRF.create(feat_range_c[k], path_prefix=input_prefix) key2, data2 = r.read_shapes(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): - assert k1 == k2, 'Wrong key for method %s' % cm - assert d1 == d2, 'Wrong shape for method %s' % cm + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): + assert k1 == k2, "Wrong key for method %s" % cm + assert d1 == d2, "Wrong shape for method %s" % cm - def test_read_range_shapes_compress_random_feat(): @@ -1324,16 +1302,15 @@ def test_read_range_shapes_compress_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51].shape) + data1.append(data_i[0][i : i + 51].shape) i += 1 for k, cm in enumerate(compression_methods): # compressed binary r = RDRF.create(feat_range_c[k], path_prefix=input_prefix) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): - assert d1 == d2, 'Wrong shape for method %s' % cm - + for d1, d2 in zip(data1, data2): + assert d1 == d2, "Wrong shape for method %s" % cm def test_read_range2_compress_seq_file_feat(): @@ -1346,12 +1323,12 @@ def 
test_read_range2_compress_seq_file_feat(): i = 0 while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + # ark compressed binary r = SDRF.create(feat_ark_c[k], path_prefix=input_prefix) key2 = [] @@ -1359,19 +1336,23 @@ def test_read_range2_compress_seq_file_feat(): i = 0 while not r.eof(): key_i, data_i = r.read(1, row_offset=i, num_rows=10) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - def test_read_range2_compress_seq_scp_feat(): for k, cm in enumerate(compression_methods): @@ -1382,7 +1363,7 @@ def test_read_range2_compress_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 r = SDRF.create(feat_scp_c[k], path_prefix=input_prefix) @@ -1395,12 +1376,16 @@ def test_read_range2_compress_seq_scp_feat(): data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - def test_read_range2_compress_random_feat(): @@ -1412,17 +1397,21 @@ def test_read_range2_compress_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + r = RDRF.create(feat_scp_c[k], path_prefix=input_prefix) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) - + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_read_rangex2_compress_seq_scp_feat(): @@ -1435,7 +1424,7 @@ def test_read_rangex2_compress_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][2*i:2*i+10]) + data1.append(data_i[0][2 * i : 2 * i + 10]) i += 1 r = SDRF.create(feat_range_c[k], path_prefix=input_prefix) @@ -1448,12 +1437,16 @@ def test_read_rangex2_compress_seq_scp_feat(): data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - def test_read_compress_rangex2_random_feat(): @@ -1465,22 +1458,27 @@ def test_read_compress_rangex2_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][2*i:2*i+10]) + data1.append(data_i[0][2 * i : 2 * i + 10]) i += 1 - + # binary r = RDRF.create(feat_range_c[k], path_prefix=input_prefix) row_offset = [i for i in range(len(key1))] 
data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - # Vector files + def test_read_seq_file_vec(): # ark binary @@ -1489,7 +1487,7 @@ def test_read_seq_file_vec(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0]) @@ -1500,17 +1498,16 @@ def test_read_seq_file_vec(): data2 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - def test_read_seq_scp_vec(): # scp binary @@ -1531,11 +1528,10 @@ def test_read_seq_scp_vec(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_random_vec(): @@ -1551,18 +1547,17 @@ def test_read_random_vec(): r = RDRF.create(vec_scp_b, path_prefix=input_prefix) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) # text r = RDRF.create(vec_scp_t, path_prefix=input_prefix) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2, rtol=1e-5) - def test_write_vec(): r = SDRF.create(vec_scp_b, path_prefix=input_prefix) @@ -1577,7 +1572,7 @@ def test_write_vec(): w = DWF.create(vec_both_bo) w.write(key1, data1) w.close() - + r = SDRF.create(vec_scp_bo) key2 = [] data2 = [] @@ -1586,16 +1581,15 @@ def test_write_vec(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2) - # text w = DWF.create(vec_both_to) w.write(key1, data1) w.close() - + r = SDRF.create(vec_scp_bo) key2 = [] data2 = [] @@ -1604,16 +1598,15 @@ def test_write_vec(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 - i = np.isclose(d1,d2,rtol=1e-4, atol=1e-5) == False + i = np.isclose(d1, d2, rtol=1e-4, atol=1e-5) == False print(d1[i]) - print(d2[i]) + print(d2[i]) assert_allclose(d1, d2, rtol=1e-4, atol=1e-5) - def test_read_shapes_seq_file_vec(): # ark binary @@ -1622,30 +1615,28 @@ def test_read_shapes_seq_file_vec(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape) - r = SDRF.create(vec_ark_b, path_prefix=input_prefix) key2 = [] data2 = [] while not r.eof(): key_i, data_i = r.read_shapes(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 assert len(d1) == 1 assert len(d2) == 1 - def test_read_shapes_seq_scp_vec(): # scp binary @@ -1657,7 +1648,6 @@ def test_read_shapes_seq_scp_vec(): key1.append(key_i[0]) data1.append(data_i[0].shape) - r = 
SDRF.create(vec_scp_b, path_prefix=input_prefix) key2 = [] data2 = [] @@ -1666,13 +1656,12 @@ def test_read_shapes_seq_scp_vec(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 assert len(d1) == 1 assert len(d2) == 1 - def test_read_shapes_random_vec(): @@ -1687,13 +1676,12 @@ def test_read_shapes_random_vec(): r = RDRF.create(vec_scp_b, path_prefix=input_prefix) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 assert len(d1) == 1 assert len(d2) == 1 - def test_read_squeeze_random_vec(): r = SDRF.create(vec_scp_b, path_prefix=input_prefix) @@ -1705,7 +1693,7 @@ def test_read_squeeze_random_vec(): key1.append(key_i[0]) data1.append(data_i[0]) i += 1 - + # binary r = RDRF.create(vec_scp_b, path_prefix=input_prefix) row_offset = [i for i in range(len(key1))] @@ -1713,11 +1701,10 @@ def test_read_squeeze_random_vec(): assert isinstance(data2, np.ndarray) assert data2.ndim == 2 - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_squeeze_random_vec_permissive(): r = SDRF.create(vec_scp_b, path_prefix=input_prefix) @@ -1729,21 +1716,19 @@ def test_read_squeeze_random_vec_permissive(): key1.append(key_i[0]) data1.append(data_i[0]) i += 1 - + # binary - key1.append('unk') - r = RDRF.create('p,'+vec_scp_b, path_prefix=input_prefix) + key1.append("unk") + r = RDRF.create("p," + vec_scp_b, path_prefix=input_prefix) data2 = r.read(key1, squeeze=True) assert isinstance(data2, np.ndarray) assert data2.ndim == 2 - for d1,d2 in zip(data1, data2[:-1]): + for d1, d2 in zip(data1, data2[:-1]): assert_allclose(d1, d2) assert_allclose(data2[-1], np.zeros(data2[0].shape)) - - def test_write_squeeze_vec(): r = SDRF.create(vec_scp_b, path_prefix=input_prefix) @@ -1760,7 +1745,7 @@ def test_write_squeeze_vec(): w = DWF.create(vec_both_bo) w.write(key1, data1s) w.close() - + r = SDRF.create(vec_scp_bo) key2 = [] data2 = [] @@ -1769,15 +1754,16 @@ def test_write_squeeze_vec(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2) + # read compressed # write compressed # read compressed range x3 # read vector # write vector -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/io/test_audio_rw.py b/tests/hyperion/io/test_audio_rw.py index 79c38667..91b01cd6 100644 --- a/tests/hyperion/io/test_audio_rw.py +++ b/tests/hyperion/io/test_audio_rw.py @@ -9,63 +9,73 @@ from numpy.testing import assert_allclose from hyperion.hyp_defs import set_float_cpu, float_cpu -from hyperion.io import AudioWriter as AW, SequentialAudioReader as SAR, RandomAccessAudioReader as RAR - -audio_path = './tests/data_out/io/audio' -wav_scp_file = audio_path + '/wav.scp' -flac_scp_file = audio_path + '/flac.scp' -pipe_scp_file = audio_path + '/pipe.scp' -segments_file = audio_path + '/segments' +from hyperion.io import ( + AudioWriter as AW, + SequentialAudioReader as SAR, + RandomAccessAudioReader as RAR, +) + +audio_path = "./tests/data_out/io/audio" +wav_scp_file = audio_path + "/wav.scp" +flac_scp_file = audio_path + "/flac.scp" +pipe_scp_file = audio_path + "/pipe.scp" +segments_file = audio_path + "/segments" fs = 16000 + def gen_signals(num_signals=3): rng = np.random.RandomState(seed=1) s = [] 
keys = [] for i in range(num_signals): s_i = rng.randn(fs) - s_i = ((2**15-1)/np.max(np.abs(s_i))*s_i).astype('int32').astype(float_cpu()) + s_i = ( + ((2 ** 15 - 1) / np.max(np.abs(s_i)) * s_i) + .astype("int32") + .astype(float_cpu()) + ) s.append(s_i) - keys.append('s%d' % i) + keys.append("s%d" % i) return keys, s keys, s = gen_signals() + def gen_segments(num_signals=3, num_segs=2): if not os.path.exists(audio_path): os.makedirs(audio_path) - + keys_seg = [] s_seg = [] - with open(segments_file, 'w') as f: + with open(segments_file, "w") as f: for i in range(num_signals): - file_i = 's%d' % (i) + file_i = "s%d" % (i) for j in range(num_segs): - seg_ij = '%s-%d' % (file_i, j) - tbeg = j*0.1 - tend = (j+1)*0.1 - f.write('%s %s %.2f %.2f\n' % (seg_ij, file_i, tbeg, tend)) + seg_ij = "%s-%d" % (file_i, j) + tbeg = j * 0.1 + tend = (j + 1) * 0.1 + f.write("%s %s %.2f %.2f\n" % (seg_ij, file_i, tbeg, tend)) keys_seg.append(seg_ij) - s_seg.append(s[i][int(tbeg*fs):int(tend*fs)]) + s_seg.append(s[i][int(tbeg * fs) : int(tend * fs)]) return keys_seg, s_seg -keys_seg, s_seg = gen_segments() +keys_seg, s_seg = gen_segments() def test_write_audio_files_wav(): - with AW(audio_path, wav_scp_file, 'wav') as w: + with AW(audio_path, wav_scp_file, "wav") as w: w.write(keys, s, fs) def test_write_audio_files_flac(): - with AW(audio_path, flac_scp_file, 'flac') as w: + with AW(audio_path, flac_scp_file, "flac") as w: w.write(keys, s, fs) @@ -74,38 +84,37 @@ def test_read_sar_wav(): with SAR(wav_scp_file) as r: keys1, s1, fs1 = r.read() - for k_i, k1_i in zip(keys,keys1): + for k_i, k1_i in zip(keys, keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s, s1): assert_allclose(s_i, s1_i, atol=1) - - + + def test_read_sar_flac(): with SAR(flac_scp_file) as r: keys1, s1, fs1 = r.read() - for k_i, k1_i in zip(keys,keys1): + for k_i, k1_i in zip(keys, keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s, s1): assert_allclose(s_i, s1_i, atol=1) - - + def test_read_sar_pipe(): - with open(pipe_scp_file,'w') as f: + with open(pipe_scp_file, "w") as f: for i, k in enumerate(keys): - f.write('%s sox %s/%s.flac -t wav - |\n' %(k, audio_path, k)) + f.write("%s sox %s/%s.flac -t wav - |\n" % (k, audio_path, k)) with SAR(pipe_scp_file) as r: keys1, s1, fs1 = r.read() - for k_i, k1_i in zip(keys,keys1): + for k_i, k1_i in zip(keys, keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s, s1): assert_allclose(s_i, s1_i, atol=1) @@ -133,9 +142,9 @@ def test_read_sar_wav_with_segments(): with SAR(wav_scp_file, segments_file) as r: keys1, s1, fs1 = r.read() - for k_i, k1_i in zip(keys_seg,keys1): + for k_i, k1_i in zip(keys_seg, keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s_seg, s1): assert_allclose(s_i, s1_i, atol=1) diff --git a/tests/hyperion/io/test_copy_feats.py b/tests/hyperion/io/test_copy_feats.py index 8a0073ed..ae5e6619 100644 --- a/tests/hyperion/io/test_copy_feats.py +++ b/tests/hyperion/io/test_copy_feats.py @@ -7,28 +7,30 @@ from hyperion.io.copy_feats import CopyFeats as CF -input_prefix = './tests/data_in/ark/' -feat_scp_b = 'scp:./tests/data_in/ark/feat_b.scp' -feat_ark_b = ['ark:./tests/data_in/ark/feat%d_b.ark' % i for i in range(1,3)] -feat_both_ho = 'h5,scp:./tests/data_out/h5/feat_cp.h5,./tests/data_out/h5/feat_cp.scp' +input_prefix = "./tests/data_in/ark/" +feat_scp_b = "scp:./tests/data_in/ark/feat_b.scp" +feat_ark_b = ["ark:./tests/data_in/ark/feat%d_b.ark" % i for i in range(1, 3)] +feat_both_ho = "h5,scp:./tests/data_out/h5/feat_cp.h5,./tests/data_out/h5/feat_cp.scp" def test_copy_feats(): 
- CF(feat_scp_b, feat_both_ho, - path_prefix=input_prefix, compress=True) + CF(feat_scp_b, feat_both_ho, path_prefix=input_prefix, compress=True) def test_merge_feats(): - CF(feat_ark_b, feat_both_ho, - path_prefix=input_prefix, compress=True) + CF(feat_ark_b, feat_both_ho, path_prefix=input_prefix, compress=True) def test_split_feats(): - CF(feat_scp_b, feat_both_ho, - path_prefix=input_prefix, compress=True, - part_idx=2, num_parts=3) + CF( + feat_scp_b, + feat_both_ho, + path_prefix=input_prefix, + compress=True, + part_idx=2, + num_parts=3, + ) - -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/io/test_h5_rw.py b/tests/hyperion/io/test_h5_rw.py index 552617e0..3d66a861 100644 --- a/tests/hyperion/io/test_h5_rw.py +++ b/tests/hyperion/io/test_h5_rw.py @@ -15,46 +15,56 @@ from hyperion.io.data_rw_factory import SequentialDataReaderFactory as SDRF from hyperion.io.data_rw_factory import RandomAccessDataReaderFactory as RDRF -input_prefix = './tests/data_in/ark/' -feat_scp_b = 'scp:./tests/data_in/ark/feat_b.scp' -feat_scp_c = ['scp:./tests/data_in/ark/feat_c%d.scp' % i for i in range(1,8)] -feat_ark_b = ['ark:./tests/data_in/ark/feat%d_b.ark' % i for i in range(1,3)] - -vec_scp_b = 'scp:./tests/data_in/ark/vec_b.scp' - -feat_h5_ho = ['h5:./tests/data_out/h5/feat%d.h5' %i for i in range(1,3)] -feat_scp_ho1 = ['./tests/data_out/h5/feat%d.scp' %i for i in range(1,3)] -feat_scp_ho2 = './tests/data_out/h5/feat.scp' -feat_scp_ho = 'scp:./tests/data_out/h5/feat.scp' -feat_range_ho1 = './tests/data_out/h5/feat_range.scp' -feat_range_ho = 'scp:./tests/data_out/h5/feat_range.scp' - -feat_both_ho = ['h5,scp:./tests/data_out/h5/feat%d.h5,./tests/data_out/h5/feat%d.scp' % - (i,i) for i in range(1,3)] -feat_scp_hso = 'scp:./tests/data_out/h5/feat_squeeze.scp' -feat_both_hso = 'h5,scp:./tests/data_out/h5/feat_squeeze.h5,./tests/data_out/h5/feat_squeeze.scp' - - -feat_scp_co = ['scp:./tests/data_out/ark/feat_c%d.scp' % i for i in range(1,8)] -feat_scp_hco = ['scp:./tests/data_out/h5/feat_c%d.scp' % i for i in range(1,8)] -feat_h5_hco = ['h5:./tests/data_out/h5/feat_c%d.h5' % i for i in range(1,8)] -feat_both_hco = ['h5,scp:./tests/data_out/h5/feat_c%d.h5,./tests/data_out/h5/feat_c%d.scp' % (i,i) for i in range(1,8)] - -feat_scp_hco1 = ['./tests/data_out/h5/feat_c%d.scp' % i for i in range(1,8)] -feat_range_hco = ['scp:./tests/data_out/h5/feat_range_c%d.scp' % i for i in range(1,8)] -feat_range_hco1 = ['./tests/data_out/h5/feat_range_c%d.scp' % i for i in range(1,8)] - -vec_h5_ho = 'h5:./tests/data_out/h5/vec.h5' -vec_scp_ho = 'scp:./tests/data_out/h5/vec.scp' -vec_both_ho = 'h5,scp:./tests/data_out/h5/vec.h5,./tests/data_out/h5/vec.scp' - -vec_scp_hso = 'scp:./tests/data_out/h5/vec_squeeze.scp' -vec_both_hso = 'h5,scp:./tests/data_out/h5/vec_squeeze.h5,./tests/data_out/h5/vec_squeeze.scp' +input_prefix = "./tests/data_in/ark/" +feat_scp_b = "scp:./tests/data_in/ark/feat_b.scp" +feat_scp_c = ["scp:./tests/data_in/ark/feat_c%d.scp" % i for i in range(1, 8)] +feat_ark_b = ["ark:./tests/data_in/ark/feat%d_b.ark" % i for i in range(1, 3)] + +vec_scp_b = "scp:./tests/data_in/ark/vec_b.scp" + +feat_h5_ho = ["h5:./tests/data_out/h5/feat%d.h5" % i for i in range(1, 3)] +feat_scp_ho1 = ["./tests/data_out/h5/feat%d.scp" % i for i in range(1, 3)] +feat_scp_ho2 = "./tests/data_out/h5/feat.scp" +feat_scp_ho = "scp:./tests/data_out/h5/feat.scp" +feat_range_ho1 = "./tests/data_out/h5/feat_range.scp" +feat_range_ho = "scp:./tests/data_out/h5/feat_range.scp" + 
+feat_both_ho = [ + "h5,scp:./tests/data_out/h5/feat%d.h5,./tests/data_out/h5/feat%d.scp" % (i, i) + for i in range(1, 3) +] +feat_scp_hso = "scp:./tests/data_out/h5/feat_squeeze.scp" +feat_both_hso = ( + "h5,scp:./tests/data_out/h5/feat_squeeze.h5,./tests/data_out/h5/feat_squeeze.scp" +) + + +feat_scp_co = ["scp:./tests/data_out/ark/feat_c%d.scp" % i for i in range(1, 8)] +feat_scp_hco = ["scp:./tests/data_out/h5/feat_c%d.scp" % i for i in range(1, 8)] +feat_h5_hco = ["h5:./tests/data_out/h5/feat_c%d.h5" % i for i in range(1, 8)] +feat_both_hco = [ + "h5,scp:./tests/data_out/h5/feat_c%d.h5,./tests/data_out/h5/feat_c%d.scp" % (i, i) + for i in range(1, 8) +] + +feat_scp_hco1 = ["./tests/data_out/h5/feat_c%d.scp" % i for i in range(1, 8)] +feat_range_hco = ["scp:./tests/data_out/h5/feat_range_c%d.scp" % i for i in range(1, 8)] +feat_range_hco1 = ["./tests/data_out/h5/feat_range_c%d.scp" % i for i in range(1, 8)] + +vec_h5_ho = "h5:./tests/data_out/h5/vec.h5" +vec_scp_ho = "scp:./tests/data_out/h5/vec.scp" +vec_both_ho = "h5,scp:./tests/data_out/h5/vec.h5,./tests/data_out/h5/vec.scp" + +vec_scp_hso = "scp:./tests/data_out/h5/vec_squeeze.scp" +vec_both_hso = ( + "h5,scp:./tests/data_out/h5/vec_squeeze.h5,./tests/data_out/h5/vec_squeeze.scp" +) compression_methods = compression_methods[:7] # Uncompressed feature files + def test_write_read_seq_file_feat(): for k in range(2): @@ -65,7 +75,7 @@ def test_write_read_seq_file_feat(): w = DWF.create(feat_both_ho[k]) w.write(key1, data1) w.close() - + r = SDRF.create(feat_h5_ho[k]) key2 = [] data2 = [] @@ -74,63 +84,58 @@ def test_write_read_seq_file_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - f, loc = ismember(key1, key2) assert np.all(f) - for i, (k1, d1) in enumerate(zip(key1,data1)): + for i, (k1, d1) in enumerate(zip(key1, data1)): assert k1 == key2[loc[i]] assert_allclose(d1, data2[loc[i]]) - with open(feat_scp_ho2, 'w') as fw: + with open(feat_scp_ho2, "w") as fw: for k in range(2): - with open(feat_scp_ho1[k], 'r') as fr: + with open(feat_scp_ho1[k], "r") as fr: for l in fr: fw.write(l) - def test_write_flush_feat(): - r = SDRF.create(feat_ark_b[0], path_prefix=input_prefix) key1, data1 = r.read(0) - + # write - w = DWF.create('f,'+feat_h5_ho[0]) + w = DWF.create("f," + feat_h5_ho[0]) w.write(key1, data1) w.close() - + r = SDRF.create(feat_h5_ho[0]) key2, data2 = r.read(0) f, loc = ismember(key1, key2) assert np.all(f) - for i, (k1, d1) in enumerate(zip(key1,data1)): + for i, (k1, d1) in enumerate(zip(key1, data1)): assert k1 == key2[loc[i]] assert_allclose(d1, data2[loc[i]]) - def test_with_write_feat(): r = SDRF.create(feat_ark_b[0], path_prefix=input_prefix) key1, data1 = r.read(0) - + # write with DWF.create(feat_h5_ho[0]) as w: w.write(key1, data1) - + r = SDRF.create(feat_h5_ho[0]) key2, data2 = r.read(0) f, loc = ismember(key1, key2) assert np.all(f) - for i, (k1, d1) in enumerate(zip(key1,data1)): + for i, (k1, d1) in enumerate(zip(key1, data1)): assert k1 == key2[loc[i]] assert_allclose(d1, data2[loc[i]]) - def test_read_seq_scp_feat(): # ark binary @@ -146,12 +151,11 @@ def test_read_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_random_file_feat(): r = SDRF.create(feat_h5_ho[0]) @@ -160,11 +164,10 @@ def test_read_random_file_feat(): r = RDRF.create(feat_h5_ho[0]) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in 
zip(data1, data2): assert_allclose(d1, d2) - def test_read_random_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -173,41 +176,38 @@ def test_read_random_scp_feat(): r = RDRF.create(feat_scp_ho) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_random_file_feat_permissive(): r = SDRF.create(feat_h5_ho[0]) key1, data1 = r.read(0) - key1.append('unk') + key1.append("unk") data1.append(np.array([])) - r = RDRF.create('p,'+feat_h5_ho[0]) + r = RDRF.create("p," + feat_h5_ho[0]) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_random_scp_feat_permissive(): r = SDRF.create(feat_scp_ho) key1, data1 = r.read(0) - key1.append('unk') + key1.append("unk") data1.append(np.array([])) - r = RDRF.create('p,'+feat_scp_ho) + r = RDRF.create("p," + feat_scp_ho) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_seq_file_split_feat(): r = SDRF.create(feat_h5_ho[0]) @@ -216,17 +216,16 @@ def test_read_seq_file_split_feat(): key2 = [] data2 = [] for i in range(2): - r = SDRF.create(feat_h5_ho[0], part_idx=i+1, num_parts=2) + r = SDRF.create(feat_h5_ho[0], part_idx=i + 1, num_parts=2) key_i, data_i = r.read(0) key2 += key_i data2 += data_i - - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_seq_scp_split_feat(): r = SDRF.create(feat_scp_ho) @@ -235,85 +234,79 @@ def test_read_seq_scp_split_feat(): key2 = [] data2 = [] for i in range(4): - r = SDRF.create(feat_scp_ho, part_idx=i+1, num_parts=4) + r = SDRF.create(feat_scp_ho, part_idx=i + 1, num_parts=4) key_i, data_i = r.read(0) key2 += key_i data2 += data_i - - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_with_read_seq_file_feat(): # without with r = SDRF.create(feat_h5_ho[0]) key1, data1 = r.read(0) - + # with with with SDRF.create(feat_h5_ho[0]) as r: key2, data2 = r.read(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - def test_with_read_seq_scp_feat(): # without with r = SDRF.create(feat_scp_ho) key1, data1 = r.read(0) - + # with with with SDRF.create(feat_scp_ho) as r: key2, data2 = r.read(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - - def test_with_read_random_file_feat(): # without with r = SDRF.create(feat_h5_ho[0]) key1, data1 = r.read(0) - + # with with with RDRF.create(feat_h5_ho[0]) as r: data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_with_read_random_scp_feat(): # without with r = SDRF.create(feat_scp_ho) key1, data1 = r.read(0) - + # with with with RDRF.create(feat_scp_ho) as r: data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_iterator_seq_file_feat(): r = SDRF.create(feat_h5_ho[0]) key1, data1 = r.read(0) - + r = SDRF.create(feat_h5_ho[0]) key2 = [] data2 = [] @@ -322,12 +315,11 @@ def test_read_iterator_seq_file_feat(): data2.append(data_i) print(key1) print(key2) - for k1,k2,d1,d2 in 
zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - - + def test_read_iterator_seq_scp_feat(): # scp binary @@ -341,11 +333,10 @@ def test_read_iterator_seq_scp_feat(): key2.append(key_i) data2.append(data_i) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - def test_reset_seq_file_feat(): @@ -358,17 +349,16 @@ def test_reset_seq_file_feat(): data2 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - def test_reset_seq_scp_feat(): # scp binary @@ -384,12 +374,11 @@ def test_reset_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_shapes_seq_file_feat(): r = SDRF.create(feat_h5_ho[0]) @@ -397,28 +386,26 @@ def test_read_shapes_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape) - r = SDRF.create(feat_h5_ho[0]) key2 = [] data2 = [] while not r.eof(): key_i, data_i = r.read_shapes(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_shapes_seq_scp_feat(): # scp binary @@ -430,7 +417,6 @@ def test_read_shapes_seq_scp_feat(): key1.append(key_i[0]) data1.append(data_i[0].shape) - r = SDRF.create(feat_scp_ho) key2 = [] data2 = [] @@ -439,12 +425,11 @@ def test_read_shapes_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_shapes_random_file_feat(): r = SDRF.create(feat_h5_ho[0]) @@ -458,11 +443,10 @@ def test_read_shapes_random_file_feat(): r = RDRF.create(feat_h5_ho[0]) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_shapes_random_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -476,11 +460,10 @@ def test_read_shapes_random_scp_feat(): r = RDRF.create(feat_scp_ho) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_num_rows_seq_file_feat(): r = SDRF.create(feat_h5_ho[0]) @@ -488,7 +471,7 @@ def test_read_num_rows_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape[0]) @@ -496,12 +479,11 @@ def test_read_num_rows_seq_file_feat(): r = SDRF.create(feat_h5_ho[0]) key2, data2 = r.read_num_rows(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_num_rows_seq_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -512,15 +494,13 @@ def test_read_num_rows_seq_scp_feat(): key1.append(key_i[0]) data1.append(data_i[0].shape[0]) - r = 
SDRF.create(feat_scp_ho) key2, data2 = r.read_num_rows(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_num_rows_random_file_feat(): @@ -535,11 +515,10 @@ def test_read_num_rows_random_file_feat(): r = RDRF.create(feat_h5_ho[0]) data2 = r.read_num_rows(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_num_rows_random_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -553,11 +532,10 @@ def test_read_num_rows_random_scp_feat(): r = RDRF.create(feat_scp_ho) data2 = r.read_num_rows(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_dims_seq_file_feat(): # ark binary @@ -566,21 +544,19 @@ def test_read_dims_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape[1]) - r = SDRF.create(feat_h5_ho[0]) key2, data2 = r.read_dims(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_dims_seq_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -591,15 +567,13 @@ def test_read_dims_seq_scp_feat(): key1.append(key_i[0]) data1.append(data_i[0].shape[1]) - r = SDRF.create(feat_scp_ho) key2, data2 = r.read_dims(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_dims_random_file_feat(): @@ -614,11 +588,10 @@ def test_read_dims_random_file_feat(): r = RDRF.create(feat_h5_ho[0]) data2 = r.read_dims(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_dims_random_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -632,20 +605,19 @@ def test_read_dims_random_scp_feat(): r = RDRF.create(feat_scp_ho) data2 = r.read_dims(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_range_seq_scp_feat(): - with open(feat_range_ho1, 'w') as w: - with open(feat_scp_ho2, 'r') as r: + with open(feat_range_ho1, "w") as w: + with open(feat_scp_ho2, "r") as r: i = 0 for l in r: - w.write('%s[%d:%d]\n' % (l.strip(), i, i+50)) + w.write("%s[%d:%d]\n" % (l.strip(), i, i + 50)) i += 1 - + r = SDRF.create(feat_scp_ho) key1 = [] data1 = [] @@ -653,7 +625,7 @@ def test_read_range_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51]) + data1.append(data_i[0][i : i + 51]) i += 1 r = SDRF.create(feat_range_ho) @@ -664,11 +636,10 @@ def test_read_range_seq_scp_feat(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_range_random_scp_feat(): @@ -679,18 +650,17 @@ def test_read_range_random_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51]) + data1.append(data_i[0][i : i + 51]) i += 1 - + # binary r = RDRF.create(feat_range_ho) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_range_shapes_seq_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -700,17 +670,16 @@ def test_read_range_shapes_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) 
key1.append(key_i[0]) - data1.append(data_i[0][i:i+51].shape) + data1.append(data_i[0][i : i + 51].shape) i += 1 r = SDRF.create(feat_range_ho) key2, data2 = r.read_shapes(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_range_shapes_random_scp_feat(): @@ -721,17 +690,16 @@ def test_read_range_shapes_random_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51].shape) + data1.append(data_i[0][i : i + 51].shape) i += 1 - + r = RDRF.create(feat_range_ho) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_range2_seq_file_feat(): # ark binary @@ -741,10 +709,10 @@ def test_read_range2_seq_file_feat(): i = 0 while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 r = SDRF.create(feat_h5_ho[0]) @@ -753,19 +721,18 @@ def test_read_range2_seq_file_feat(): i = 0 while not r.eof(): key_i, data_i = r.read(1, row_offset=i, num_rows=10) - if len(key_i)==0: + if len(key_i) == 0: break print(key_i[0]) key2.append(key_i[0]) data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-5) - - + def test_read_range2_seq_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -775,7 +742,7 @@ def test_read_range2_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 r = SDRF.create(feat_scp_ho) @@ -788,11 +755,10 @@ def test_read_range2_seq_scp_feat(): data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_range2_random_file_feat(): @@ -803,18 +769,17 @@ def test_read_range2_random_file_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + r = RDRF.create(feat_h5_ho[0]) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_range2_random_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -824,19 +789,18 @@ def test_read_range2_random_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + # binary r = RDRF.create(feat_scp_ho) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_rangex2_seq_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -846,7 +810,7 @@ def test_read_rangex2_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][2*i:2*i+10]) + data1.append(data_i[0][2 * i : 2 * i + 10]) i += 1 r = SDRF.create(feat_range_ho) @@ -859,11 +823,10 @@ def test_read_rangex2_seq_scp_feat(): data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): 
assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_rangex2_random_scp_feat(): @@ -874,19 +837,18 @@ def test_read_rangex2_random_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][2*i:2*i+10]) + data1.append(data_i[0][2 * i : 2 * i + 10]) i += 1 - + # binary r = RDRF.create(feat_range_ho) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_squeeze_random_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -896,20 +858,19 @@ def test_read_squeeze_random_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + r = RDRF.create(feat_scp_ho) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, squeeze=True, row_offset=row_offset, num_rows=10) assert isinstance(data2, np.ndarray) assert data2.ndim == 3 - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_squeeze_random_scp_feat_permissive(): r = SDRF.create(feat_scp_b, path_prefix=input_prefix) @@ -919,23 +880,21 @@ def test_read_squeeze_random_scp_feat_permissive(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - - key1.append('unk') - r = RDRF.create('p,'+feat_scp_ho) + + key1.append("unk") + r = RDRF.create("p," + feat_scp_ho) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, squeeze=True, row_offset=row_offset, num_rows=10) assert isinstance(data2, np.ndarray) assert data2.ndim == 3 - for d1,d2 in zip(data1, data2[:-1]): + for d1, d2 in zip(data1, data2[:-1]): assert_allclose(d1, d2) assert_allclose(data2[-1], np.zeros(data2[0].shape)) - - def test_write_squeeze_feat(): r = SDRF.create(feat_scp_ho) @@ -952,18 +911,18 @@ def test_write_squeeze_feat(): w = DWF.create(feat_both_hso) w.write(key1, data1s) w.close() - + r = SDRF.create(feat_scp_hso) key2, data2 = r.read(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2) - # Compressed feature files + def test_write_read_seq_scp_compress_feat(): r = SDRF.create(feat_scp_ho) @@ -983,17 +942,16 @@ def test_write_read_seq_scp_compress_feat(): r = SDRF.create(feat_scp_hco[i]) key2, data2 = r.read(0) - for d1,d1c,d2 in zip(data1, data1c, data2): - err11c = np.abs(d1-d1c) + np.abs(d1)*0.001 - err1c2 = np.abs(d1c-d2) - err12 = np.abs(d1-d2) - + for d1, d1c, d2 in zip(data1, data1c, data2): + err11c = np.abs(d1 - d1c) + np.abs(d1) * 0.001 + err1c2 = np.abs(d1c - d2) + err12 = np.abs(d1 - d2) + f = np.logical_and(err11c < err1c2, err11c < err12) - for a,b,c in zip(d1[f], d1c[f], d2[f]): - print(a,b,c,a-b,b-c,a-c) - - assert not np.any(f), 'Write compression %s failed' % cm + for a, b, c in zip(d1[f], d1c[f], d2[f]): + print(a, b, c, a - b, b - c, a - c) + assert not np.any(f), "Write compression %s failed" % cm def test_read_compress_seq_file_feat(): @@ -1006,13 +964,17 @@ def test_read_compress_seq_file_feat(): key2, data2 = r.read(0) f, loc = ismember(key2, key1) - for i,(k2,d2) in enumerate(zip(key2, data2)): + for i, (k2, d2) in enumerate(zip(key2, data2)): assert key1[loc[i]] == k2 - assert_allclose(data1[loc[i]], d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + assert_allclose( + 
data1[loc[i]], + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - def test_read_compress_random_file_feat(): for i, cm in enumerate(compression_methods): @@ -1022,10 +984,14 @@ def test_read_compress_random_file_feat(): r = RDRF.create(feat_h5_hco[i]) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) - + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_read_compress_random_scp_feat(): @@ -1037,12 +1003,16 @@ def test_read_compress_random_scp_feat(): r = RDRF.create(feat_scp_hco[i]) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - def test_read_shapes_compress_seq_file_feat(): r = SDRF.create(feat_h5_hco[0]) @@ -1050,22 +1020,20 @@ def test_read_shapes_compress_seq_file_feat(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape) - for i, cm in enumerate(compression_methods): r = SDRF.create(feat_h5_hco[i]) key2, data2 = r.read_shapes(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): - assert k1 == k2, 'Wrong key for method %s' % cm - assert d1 == d2, 'Wrong shape for method %s' % cm + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): + assert k1 == k2, "Wrong key for method %s" % cm + assert d1 == d2, "Wrong shape for method %s" % cm - def test_read_shapes_compress_seq_scp_feat(): r = SDRF.create(feat_scp_ho) @@ -1075,11 +1043,10 @@ def test_read_shapes_compress_seq_scp_feat(): r = SDRF.create(feat_scp_hco[i]) key2, data2 = r.read_shapes(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): - assert k1 == k2, 'Wrong key for method %s' % cm - assert d1 == d2, 'Wrong shape for method %s' % cm + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): + assert k1 == k2, "Wrong key for method %s" % cm + assert d1 == d2, "Wrong shape for method %s" % cm - def test_read_shapes_compress_random_file_feat(): @@ -1090,11 +1057,10 @@ def test_read_shapes_compress_random_file_feat(): r = RDRF.create(feat_h5_hco[i]) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): - assert d1 == d2, 'Wrong shape for method %s' % cm + for d1, d2 in zip(data1, data2): + assert d1 == d2, "Wrong shape for method %s" % cm - def test_read_shapes_compress_random_file_feat(): r = SDRF.create(feat_scp_ho) @@ -1104,20 +1070,18 @@ def test_read_shapes_compress_random_file_feat(): r = RDRF.create(feat_scp_hco[i]) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): - assert d1 == d2, 'Wrong shape for method %s' % cm - + for d1, d2 in zip(data1, data2): + assert d1 == d2, "Wrong shape for method %s" % cm def test_read_range_compress_seq_scp_feat(): - for k, cm in enumerate(compression_methods): - with open(feat_range_hco1[k], 'w') as w: - with open(feat_scp_hco1[k], 'r') as r: + with open(feat_range_hco1[k], "w") as w: + with open(feat_scp_hco1[k], "r") as r: i = 0 for l in r: - w.write('%s[%d:%d]\n' % (l.strip(), i, i+50)) + w.write("%s[%d:%d]\n" % (l.strip(), i, i + 50)) i += 1 r = SDRF.create(feat_scp_hco[k]) @@ -1127,17 +1091,21 @@ def test_read_range_compress_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) 
key1.append(key_i[0]) - data1.append(data_i[0][i:i+51]) + data1.append(data_i[0][i : i + 51]) i += 1 r = SDRF.create(feat_range_hco[k]) key2, data2 = r.read(0) - - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): - assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): + assert k1 == k2 + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_read_range_compress_random_feat(): @@ -1151,17 +1119,21 @@ def test_read_range_compress_random_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51]) + data1.append(data_i[0][i : i + 51]) i += 1 - + # scp compressed binary r = RDRF.create(feat_range_hco[k]) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) - + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_read_range_shapes_compress_seq_scp_feat(): @@ -1173,18 +1145,17 @@ def test_read_range_shapes_compress_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51].shape) + data1.append(data_i[0][i : i + 51].shape) i += 1 for k, cm in enumerate(compression_methods): r = SDRF.create(feat_range_hco[k]) key2, data2 = r.read_shapes(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): - assert k1 == k2, 'Wrong key for method %s' % cm - assert d1 == d2, 'Wrong shape for method %s' % cm + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): + assert k1 == k2, "Wrong key for method %s" % cm + assert d1 == d2, "Wrong shape for method %s" % cm - def test_read_range_shapes_compress_random_scp_feat(): @@ -1195,15 +1166,14 @@ def test_read_range_shapes_compress_random_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+51].shape) + data1.append(data_i[0][i : i + 51].shape) i += 1 for k, cm in enumerate(compression_methods): r = RDRF.create(feat_range_hco[k]) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): - assert d1 == d2, 'Wrong shape for method %s' % cm - + for d1, d2 in zip(data1, data2): + assert d1 == d2, "Wrong shape for method %s" % cm def test_read_range2_compress_seq_file_feat(): @@ -1215,31 +1185,35 @@ def test_read_range2_compress_seq_file_feat(): i = 0 while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + r = SDRF.create(feat_h5_hco[k]) key2 = [] data2 = [] i = 0 while not r.eof(): key_i, data_i = r.read(1, row_offset=i, num_rows=10) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - def test_read_range2_compress_seq_scp_feat(): for k, cm in enumerate(compression_methods): @@ -1250,7 +1224,7 @@ def test_read_range2_compress_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + 
data1.append(data_i[0][i : i + 10]) i += 1 r = SDRF.create(feat_scp_hco[k]) @@ -1263,12 +1237,16 @@ def test_read_range2_compress_seq_scp_feat(): data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - def test_read_range2_compress_random_file_feat(): @@ -1280,17 +1258,21 @@ def test_read_range2_compress_random_file_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + r = RDRF.create(feat_h5_hco[k]) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) - + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_read_range2_compress_random_file_feat(): @@ -1303,17 +1285,21 @@ def test_read_range2_compress_random_file_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][i:i+10]) + data1.append(data_i[0][i : i + 10]) i += 1 - + r = RDRF.create(feat_scp_hco[k]) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) - + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) def test_read_rangex2_compress_seq_scp_feat(): @@ -1326,7 +1312,7 @@ def test_read_rangex2_compress_seq_scp_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][2*i:2*i+10]) + data1.append(data_i[0][2 * i : 2 * i + 10]) i += 1 r = SDRF.create(feat_range_hco[k]) @@ -1339,12 +1325,16 @@ def test_read_rangex2_compress_seq_scp_feat(): data2.append(data_i[0]) i += 1 - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) - def test_read_compress_rangex2_random_file_feat(): @@ -1356,31 +1346,36 @@ def test_read_compress_rangex2_random_file_feat(): while not r.eof(): key_i, data_i = r.read(1) key1.append(key_i[0]) - data1.append(data_i[0][2*i:2*i+10]) + data1.append(data_i[0][2 * i : 2 * i + 10]) i += 1 r = RDRF.create(feat_range_hco[k]) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, row_offset=row_offset, num_rows=10) - for d1,d2 in zip(data1, data2): - assert_allclose(d1, d2, rtol=1e-5, atol=1e-4, - err_msg=('Read compression %s failed' % cm)) - + for d1, d2 in zip(data1, data2): + assert_allclose( + d1, + d2, + rtol=1e-5, + atol=1e-4, + err_msg=("Read compression %s failed" % cm), + ) # Vector files + def test_write_read_seq_file_vec(): r = SDRF.create(vec_scp_b, path_prefix=input_prefix) key1, data1 = r.read(0) - + # write w = DWF.create(vec_both_ho) w.write(key1, data1) w.close() - + r = SDRF.create(vec_h5_ho) key2 = [] data2 = [] @@ 
-1391,12 +1386,11 @@ def test_write_read_seq_file_vec(): f, loc = ismember(key1, key2) assert np.all(f) - for i, (k1, d1) in enumerate(zip(key1,data1)): + for i, (k1, d1) in enumerate(zip(key1, data1)): assert k1 == key2[loc[i]] assert_allclose(d1, data2[loc[i]]) - def test_read_seq_scp_vec(): # ark binary @@ -1412,12 +1406,11 @@ def test_read_seq_scp_vec(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2, rtol=1e-4) - def test_read_random_file_vec(): r = SDRF.create(vec_h5_ho) @@ -1426,11 +1419,10 @@ def test_read_random_file_vec(): r = RDRF.create(vec_h5_ho) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_random_scp_vec(): r = SDRF.create(vec_scp_ho) @@ -1439,11 +1431,10 @@ def test_read_random_scp_vec(): r = RDRF.create(vec_scp_ho) data2 = r.read(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_shapes_seq_file_vec(): r = SDRF.create(vec_h5_ho) @@ -1451,28 +1442,26 @@ def test_read_shapes_seq_file_vec(): data1 = [] while not r.eof(): key_i, data_i = r.read(1) - if len(key_i)==0: + if len(key_i) == 0: break key1.append(key_i[0]) data1.append(data_i[0].shape) - r = SDRF.create(vec_h5_ho) key2 = [] data2 = [] while not r.eof(): key_i, data_i = r.read_shapes(1) - if len(key_i)==0: + if len(key_i) == 0: break key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_shapes_seq_scp_vec(): r = SDRF.create(vec_scp_ho) @@ -1483,7 +1472,6 @@ def test_read_shapes_seq_scp_vec(): key1.append(key_i[0]) data1.append(data_i[0].shape) - r = SDRF.create(vec_scp_ho) key2 = [] data2 = [] @@ -1492,12 +1480,11 @@ def test_read_shapes_seq_scp_vec(): key2.append(key_i[0]) data2.append(data_i[0]) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert d1 == d2 - def test_read_shapes_random_file_vec(): r = SDRF.create(vec_h5_ho) @@ -1511,11 +1498,10 @@ def test_read_shapes_random_file_vec(): r = RDRF.create(vec_h5_ho) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_shapes_random_scp_vec(): r = SDRF.create(vec_scp_ho) @@ -1529,11 +1515,10 @@ def test_read_shapes_random_scp_vec(): r = RDRF.create(vec_scp_ho) data2 = r.read_shapes(key1) - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert d1 == d2 - def test_read_squeeze_random_scp_vec(): r = SDRF.create(vec_scp_ho) @@ -1545,18 +1530,17 @@ def test_read_squeeze_random_scp_vec(): key1.append(key_i[0]) data1.append(data_i[0]) i += 1 - + r = RDRF.create(vec_scp_ho) row_offset = [i for i in range(len(key1))] data2 = r.read(key1, squeeze=True) assert isinstance(data2, np.ndarray) assert data2.ndim == 2 - for d1,d2 in zip(data1, data2): + for d1, d2 in zip(data1, data2): assert_allclose(d1, d2) - def test_read_squeeze_random_scp_vec_permissive(): r = SDRF.create(vec_scp_b, path_prefix=input_prefix) @@ -1568,20 +1552,19 @@ def test_read_squeeze_random_scp_vec_permissive(): key1.append(key_i[0]) data1.append(data_i[0]) i += 1 - - key1.append('unk') - r = RDRF.create('p,'+vec_scp_ho) + + key1.append("unk") + r = RDRF.create("p," + vec_scp_ho) row_offset = [i for i in 
range(len(key1))] data2 = r.read(key1, squeeze=True) assert isinstance(data2, np.ndarray) assert data2.ndim == 2 - for d1,d2 in zip(data1, data2[:-1]): + for d1, d2 in zip(data1, data2[:-1]): assert_allclose(d1, d2) assert_allclose(data2[-1], np.zeros(data2[0].shape)) - def test_write_squeeze_vec(): r = SDRF.create(vec_scp_ho) @@ -1598,14 +1581,14 @@ def test_write_squeeze_vec(): w = DWF.create(vec_both_hso) w.write(key1, data1s) w.close() - + r = SDRF.create(vec_scp_hso) key2, data2 = r.read(0) - for k1,k2,d1,d2 in zip(key1, key2, data1, data2): + for k1, k2, d1, d2 in zip(key1, key2, data1, data2): assert k1 == k2 assert_allclose(d1, d2) -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/io/test_packed_audio_rw.py b/tests/hyperion/io/test_packed_audio_rw.py index 5582aa2d..1d0d4844 100644 --- a/tests/hyperion/io/test_packed_audio_rw.py +++ b/tests/hyperion/io/test_packed_audio_rw.py @@ -9,64 +9,74 @@ from numpy.testing import assert_allclose from hyperion.hyp_defs import set_float_cpu, float_cpu -from hyperion.io import PackedAudioWriter as AW, SequentialPackedAudioReader as SAR, RandomAccessPackedAudioReader as RAR - -audio_path = './tests/data_out/io/packed_audio' -wav_scp_file = audio_path + '/wav.scp' -flac_scp_file = audio_path + '/flac.scp' -wav_file = audio_path + '/audio.wav' -flac_file = audio_path + '/audio.flac' -segments_file = audio_path + '/segments' +from hyperion.io import ( + PackedAudioWriter as AW, + SequentialPackedAudioReader as SAR, + RandomAccessPackedAudioReader as RAR, +) + +audio_path = "./tests/data_out/io/packed_audio" +wav_scp_file = audio_path + "/wav.scp" +flac_scp_file = audio_path + "/flac.scp" +wav_file = audio_path + "/audio.wav" +flac_file = audio_path + "/audio.flac" +segments_file = audio_path + "/segments" fs = 16000 + def gen_signals(num_signals=3): rng = np.random.RandomState(seed=1) s = [] keys = [] for i in range(num_signals): s_i = rng.randn(fs) - s_i = ((2**15-1)/np.max(np.abs(s_i))*s_i).astype('int16').astype(float_cpu()) + s_i = ( + ((2 ** 15 - 1) / np.max(np.abs(s_i)) * s_i) + .astype("int16") + .astype(float_cpu()) + ) s.append(s_i) - keys.append('s%d' % i) + keys.append("s%d" % i) return keys, s keys, s = gen_signals() + def gen_segments(num_signals=3, num_segs=2): if not os.path.exists(audio_path): os.makedirs(audio_path) - + keys_seg = [] s_seg = [] - with open(segments_file, 'w') as f: + with open(segments_file, "w") as f: for i in range(num_signals): - file_i = 's%d' % (i) + file_i = "s%d" % (i) for j in range(num_segs): - seg_ij = '%s-%d' % (file_i, j) - tbeg = j*0.1 - tend = (j+1)*0.1 - f.write('%s %s %.2f %.2f\n' % (seg_ij, file_i, tbeg, tend)) + seg_ij = "%s-%d" % (file_i, j) + tbeg = j * 0.1 + tend = (j + 1) * 0.1 + f.write("%s %s %.2f %.2f\n" % (seg_ij, file_i, tbeg, tend)) keys_seg.append(seg_ij) - s_seg.append(s[i][int(tbeg*fs):int(tend*fs)]) + s_seg.append(s[i][int(tbeg * fs) : int(tend * fs)]) return keys_seg, s_seg -keys_seg, s_seg = gen_segments() +keys_seg, s_seg = gen_segments() def test_write_audio_files_wav(): - with AW(wav_file, wav_scp_file, 'wav', fs=fs) as w: + with AW(wav_file, wav_scp_file, "wav", fs=fs) as w: w.write(keys, s) def test_write_audio_files_flac(): - with AW(flac_file, flac_scp_file, 'flac', fs=fs) as w: + with AW(flac_file, flac_scp_file, "flac", fs=fs) as w: w.write(keys, s) @@ -75,25 +85,25 @@ def test_read_sar_wav(): with SAR(wav_scp_file) as r: keys1, s1, fs1 = r.read() - for k_i, k1_i in zip(keys,keys1): + for k_i, k1_i in zip(keys, 
keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s, s1): assert_allclose(s_i, s1_i, atol=1) - - + + def test_read_sar_flac(): with SAR(flac_scp_file) as r: keys1, s1, fs1 = r.read() - for k_i, k1_i in zip(keys,keys1): + for k_i, k1_i in zip(keys, keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s, s1): assert_allclose(s_i, s1_i, atol=1) - + def test_read_sar_iter(): with SAR(wav_scp_file) as r: @@ -119,7 +129,7 @@ def test_read_sar_wav_with_segments(): for k_i, k1_i in zip(keys_seg, keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s_seg, s1): assert_allclose(s_i, s1_i, atol=1) @@ -138,29 +148,29 @@ def test_read_sar_wav_intervals(): with SAR(wav_scp_file) as r: keys1, s1, fs1 = r.read(time_offset=0.2, time_durs=0.5) - n_start = int(0.2*fs) - n = int(0.5*fs) + n_start = int(0.2 * fs) + n = int(0.5 * fs) - for k_i, k1_i in zip(keys,keys1): + for k_i, k1_i in zip(keys, keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s, s1): - assert_allclose(s_i[n_start:n_start+n], s1_i, atol=1) - - + assert_allclose(s_i[n_start : n_start + n], s1_i, atol=1) + + def test_read_sar_flac_intervals(): with SAR(flac_scp_file) as r: keys1, s1, fs1 = r.read(time_offset=0.2, time_durs=0.5) - n_start = int(0.2*fs) - n = int(0.5*fs) + n_start = int(0.2 * fs) + n = int(0.5 * fs) - for k_i, k1_i in zip(keys,keys1): + for k_i, k1_i in zip(keys, keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s, s1): - assert_allclose(s_i[n_start:n_start+n], s1_i, atol=1) + assert_allclose(s_i[n_start : n_start + n], s1_i, atol=1) def test_read_rar_wav_intervals(): @@ -168,11 +178,11 @@ def test_read_rar_wav_intervals(): with RAR(wav_scp_file) as r: s1, fs1 = r.read(keys, time_offset=0.2, time_durs=0.5) - n_start = int(0.2*fs) - n = int(0.5*fs) + n_start = int(0.2 * fs) + n = int(0.5 * fs) for s_i, s1_i in zip(s, s1): - assert_allclose(s_i[n_start:n_start+n], s1_i, atol=1) + assert_allclose(s_i[n_start : n_start + n], s1_i, atol=1) def test_read_rar_flac_intervals(): @@ -180,11 +190,11 @@ def test_read_rar_flac_intervals(): with RAR(flac_scp_file) as r: s1, fs1 = r.read(keys, time_offset=0.2, time_durs=0.5) - n_start = int(0.2*fs) - n = int(0.5*fs) + n_start = int(0.2 * fs) + n = int(0.5 * fs) for s_i, s1_i in zip(s, s1): - assert_allclose(s_i[n_start:n_start+n], s1_i, atol=1) + assert_allclose(s_i[n_start : n_start + n], s1_i, atol=1) def test_read_sar_wav_with_segments_and_intervals(): @@ -192,14 +202,14 @@ def test_read_sar_wav_with_segments_and_intervals(): with SAR(wav_scp_file, segments_file) as r: keys1, s1, fs1 = r.read(time_offset=0.02, time_durs=0.05) - n_start = int(0.02*fs) - n = int(0.05*fs) + n_start = int(0.02 * fs) + n = int(0.05 * fs) for k_i, k1_i in zip(keys_seg, keys1): assert k_i == k1_i - + for s_i, s1_i in zip(s_seg, s1): - assert_allclose(s_i[n_start:n_start+n], s1_i, atol=1) + assert_allclose(s_i[n_start : n_start + n], s1_i, atol=1) def test_read_rar_with_segments_and_intervals(): @@ -207,28 +217,27 @@ def test_read_rar_with_segments_and_intervals(): with RAR(flac_scp_file, segments_file) as r: s1, fs1 = r.read(keys_seg, time_offset=0.02, time_durs=0.05) - n_start = int(0.02*fs) - n = int(0.05*fs) + n_start = int(0.02 * fs) + n = int(0.05 * fs) for s_i, s1_i in zip(s_seg, s1): - assert_allclose(s_i[n_start:n_start+n], s1_i, atol=1) - + assert_allclose(s_i[n_start : n_start + n], s1_i, atol=1) def test_read_sar_num_samples(): - + with SAR(wav_scp_file) as r: keys1, ns1 = r.read_num_samples() for k_i, k1_i in zip(keys, keys1): assert k_i == k1_i - + for s_i, ns1_i in zip(s, ns1): 
assert_allclose(len(s_i), ns1_i) def test_read_rar_num_samples(): - + with RAR(wav_scp_file) as r: ns1 = r.read_num_samples(keys) @@ -237,19 +246,19 @@ def test_read_rar_num_samples(): def test_read_sar_num_samples_segments(): - + with SAR(wav_scp_file, segments_file) as r: keys1, ns1 = r.read_num_samples() for k_i, k1_i in zip(keys_seg, keys1): assert k_i == k1_i - + for s_i, ns1_i in zip(s_seg, ns1): assert_allclose(len(s_i), ns1_i) def test_read_rar_num_samples_segments(): - + with RAR(wav_scp_file, segments_file) as r: ns1 = r.read_num_samples(keys_seg) @@ -258,19 +267,19 @@ def test_read_rar_num_samples_segments(): def test_read_sar_time_duration(): - + with SAR(wav_scp_file) as r: keys1, ts1 = r.read_time_duration() for k_i, k1_i in zip(keys, keys1): assert k_i == k1_i - + for s_i, ts1_i in zip(s, ts1): - assert_allclose(1., ts1_i) + assert_allclose(1.0, ts1_i) def test_read_rar_time_duration(): - + with RAR(wav_scp_file) as r: ts1 = r.read_time_duration(keys) @@ -279,19 +288,19 @@ def test_read_rar_time_duration(): def test_read_sar_time_duration_segments(): - + with SAR(wav_scp_file, segments_file) as r: keys1, ts1 = r.read_time_duration() for k_i, k1_i in zip(keys_seg, keys1): assert k_i == k1_i - + for s_i, ts1_i in zip(s_seg, ts1): assert_allclose(0.1, ts1_i) def test_read_rar_time_duration_segments(): - + with RAR(wav_scp_file, segments_file) as r: ts1 = r.read_time_duration(keys_seg) diff --git a/tests/hyperion/io/test_rw_specifiers.py b/tests/hyperion/io/test_rw_specifiers.py index ab9abbe8..13767c18 100644 --- a/tests/hyperion/io/test_rw_specifiers.py +++ b/tests/hyperion/io/test_rw_specifiers.py @@ -8,95 +8,93 @@ from hyperion.io.rw_specifiers import * -output_dir = './tests/data_out/io' +output_dir = "./tests/data_out/io" if not os.path.exists(output_dir): os.makedirs(output_dir) + def test_rspecifier(): - rs1 = RSpecifier(RSpecType.ARCHIVE, 'file.h5', ArchiveType.H5) - rs2 = RSpecifier.create('file.h5') + rs1 = RSpecifier(RSpecType.ARCHIVE, "file.h5", ArchiveType.H5) + rs2 = RSpecifier.create("file.h5") assert rs1 == rs2 - rs2 = RSpecifier.create('h5:file.h5') + rs2 = RSpecifier.create("h5:file.h5") assert rs1 == rs2 - rs1 = RSpecifier(RSpecType.ARCHIVE, 'file.ark', ArchiveType.ARK) - rs2 = RSpecifier.create('ark:file.ark') + rs1 = RSpecifier(RSpecType.ARCHIVE, "file.ark", ArchiveType.ARK) + rs2 = RSpecifier.create("ark:file.ark") assert rs1 == rs2 - rs1 = RSpecifier(RSpecType.ARCHIVE, 'file.ark', ArchiveType.ARK, - True, True, True, True, True) - rs2 = RSpecifier.create('ark,o,s,cs,p,bg:file.ark') + rs1 = RSpecifier( + RSpecType.ARCHIVE, "file.ark", ArchiveType.ARK, True, True, True, True, True + ) + rs2 = RSpecifier.create("ark,o,s,cs,p,bg:file.ark") assert rs1 == rs2 - file_path = output_dir + '/file.scp' - with open(file_path, 'w') as f: - f.write('key file1.ark:0\n') + file_path = output_dir + "/file.scp" + with open(file_path, "w") as f: + f.write("key file1.ark:0\n") rs1 = RSpecifier(RSpecType.SCRIPT, file_path, ArchiveType.ARK) - rs2 = RSpecifier.create('scp:' + file_path) + rs2 = RSpecifier.create("scp:" + file_path) assert rs1 == rs2 - with open(file_path, 'w') as f: - f.write('key file1.h5\n') + with open(file_path, "w") as f: + f.write("key file1.h5\n") rs1 = RSpecifier(RSpecType.SCRIPT, file_path, ArchiveType.H5) - rs2 = RSpecifier.create('scp:' + file_path) + rs2 = RSpecifier.create("scp:" + file_path) assert rs1 == rs2 - - with open(file_path, 'w') as f: - f.write('key file1.flac:0[0:10]\n') + with open(file_path, "w") as f: + f.write("key 
file1.flac:0[0:10]\n") rs1 = RSpecifier(RSpecType.SCRIPT, file_path, ArchiveType.AUDIO) - rs2 = RSpecifier.create('scp:' + file_path) + rs2 = RSpecifier.create("scp:" + file_path) assert rs1 == rs2 def test_wspecifier(): - rs1 = WSpecifier(WSpecType.ARCHIVE, 'file.h5', None, ArchiveType.H5) - rs2 = WSpecifier.create('file.h5') + rs1 = WSpecifier(WSpecType.ARCHIVE, "file.h5", None, ArchiveType.H5) + rs2 = WSpecifier.create("file.h5") assert rs1 == rs2 - rs2 = WSpecifier.create('h5:file.h5') + rs2 = WSpecifier.create("h5:file.h5") assert rs1 == rs2 - rs1 = WSpecifier(WSpecType.ARCHIVE, 'file.ark', None, ArchiveType.ARK) - rs2 = WSpecifier.create('ark:file.ark') + rs1 = WSpecifier(WSpecType.ARCHIVE, "file.ark", None, ArchiveType.ARK) + rs2 = WSpecifier.create("ark:file.ark") assert rs1 == rs2 - rs1 = WSpecifier(WSpecType.ARCHIVE, 'file.ark', None, ArchiveType.ARK) - rs2 = WSpecifier.create('ark,b,nf:file.ark') + rs1 = WSpecifier(WSpecType.ARCHIVE, "file.ark", None, ArchiveType.ARK) + rs2 = WSpecifier.create("ark,b,nf:file.ark") assert rs1 == rs2 - rs1 = WSpecifier(WSpecType.ARCHIVE, 'file.ark', None, ArchiveType.ARK) - rs2 = WSpecifier.create('ark,b,nf:file.ark') + rs1 = WSpecifier(WSpecType.ARCHIVE, "file.ark", None, ArchiveType.ARK) + rs2 = WSpecifier.create("ark,b,nf:file.ark") assert rs1 == rs2 - - rs1 = WSpecifier(WSpecType.ARCHIVE, 'file.ark', None, ArchiveType.ARK, - False, True, True) - rs2 = WSpecifier.create('ark,t,f,p:file.ark') + rs1 = WSpecifier( + WSpecType.ARCHIVE, "file.ark", None, ArchiveType.ARK, False, True, True + ) + rs2 = WSpecifier.create("ark,t,f,p:file.ark") assert rs1 == rs2 - - rs1 = WSpecifier(WSpecType.SCRIPT, None, 'file.scp', None) - rs2 = WSpecifier.create('scp:file.scp') + + rs1 = WSpecifier(WSpecType.SCRIPT, None, "file.scp", None) + rs2 = WSpecifier.create("scp:file.scp") assert rs1 == rs2 - rs1 = WSpecifier(WSpecType.BOTH, 'file.ark', 'file.scp', - ArchiveType.ARK,False) - rs2 = WSpecifier.create('ark,t,scp:file.ark,file.scp') + rs1 = WSpecifier(WSpecType.BOTH, "file.ark", "file.scp", ArchiveType.ARK, False) + rs2 = WSpecifier.create("ark,t,scp:file.ark,file.scp") assert rs1 == rs2 - rs1 = WSpecifier(WSpecType.BOTH, 'file.h5', 'file.scp', - ArchiveType.H5) - rs2 = WSpecifier.create('h5,scp:file.h5,file.scp') + rs1 = WSpecifier(WSpecType.BOTH, "file.h5", "file.scp", ArchiveType.H5) + rs2 = WSpecifier.create("h5,scp:file.h5,file.scp") assert rs1 == rs2 - rs1 = WSpecifier(WSpecType.BOTH, 'file.flac', 'file.scp', - ArchiveType.AUDIO) - rs2 = WSpecifier.create('audio,scp:file.flac,file.scp') + rs1 = WSpecifier(WSpecType.BOTH, "file.flac", "file.scp", ArchiveType.AUDIO) + rs2 = WSpecifier.create("audio,scp:file.flac,file.scp") assert rs1 == rs2 -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/metrics/test_acc.py b/tests/hyperion/metrics/test_acc.py index 19db0221..b3a55a21 100644 --- a/tests/hyperion/metrics/test_acc.py +++ b/tests/hyperion/metrics/test_acc.py @@ -10,13 +10,12 @@ from hyperion.metrics.acc import compute_accuracy + def test_compute_accuracy(): - y_true = np.arange(10, dtype='int32') - y_pred = np.arange(10, dtype='int32') + y_true = np.arange(10, dtype="int32") + y_pred = np.arange(10, dtype="int32") y_pred[:3] = 5 acc = compute_accuracy(y_true, y_pred) assert acc == 0.7 - - diff --git a/tests/hyperion/metrics/test_cllr.py b/tests/hyperion/metrics/test_cllr.py index b4d7f58c..535896de 100644 --- a/tests/hyperion/metrics/test_cllr.py +++ b/tests/hyperion/metrics/test_cllr.py @@ -20,8 +20,6 
@@ def test_cllr(): def test_min_cllr(): - s = 10*np.ones((100,)) + s = 10 * np.ones((100,)) c = compute_min_cllr(s, s) - assert c <= 1 and c >0.99 - - + assert c <= 1 and c > 0.99 diff --git a/tests/hyperion/metrics/test_confusion_matrix.py b/tests/hyperion/metrics/test_confusion_matrix.py index 6873e993..98fe874a 100644 --- a/tests/hyperion/metrics/test_confusion_matrix.py +++ b/tests/hyperion/metrics/test_confusion_matrix.py @@ -28,10 +28,6 @@ def test_xlabel_confusion_matrix(): y_pred = np.array([1, 1, 0, 0, 0, 1, 1, 1, 0, 0]) C = compute_xlabel_confusion_matrix(y_true, y_pred) - C_true = np.array([[3/5, 2/5], [2/3, 1/3], [0, 1]]) + C_true = np.array([[3 / 5, 2 / 5], [2 / 3, 1 / 3], [0, 1]]) assert_allclose(C, C_true) - - - - diff --git a/tests/hyperion/metrics/test_dcf.py b/tests/hyperion/metrics/test_dcf.py index 1bcf6c81..a3064b7c 100644 --- a/tests/hyperion/metrics/test_dcf.py +++ b/tests/hyperion/metrics/test_dcf.py @@ -23,58 +23,57 @@ def test_dcf(): p_tar = [0.1, 0.5] dcf = compute_dcf(p_miss, p_fa, p_tar) - - dcf_ref = np.array([ - [0.01+9*0.03, 0.02+9*0.02, 0.03+9*0.01], - [0.04, 0.04, 0.04]]) + + dcf_ref = np.array( + [[0.01 + 9 * 0.03, 0.02 + 9 * 0.02, 0.03 + 9 * 0.01], [0.04, 0.04, 0.04]] + ) assert_allclose(dcf_ref, dcf) - + def test_min_dcf(): - tar = np.linspace(-2,10,1000)+3 - non = np.linspace(-10,2,1000)+3 + tar = np.linspace(-2, 10, 1000) + 3 + non = np.linspace(-10, 2, 1000) + 3 - p=0.5 + p = 0.5 dcf, _, _ = compute_min_dcf(tar, non, p) assert dcf > 0.332 and dcf < 0.334 p = [0.1, 0.5, 0.9] dcf, _, _ = compute_min_dcf(tar, non, p) assert dcf[1] > 0.332 and dcf[1] < 0.334 - def test_act_dcf(): - tar = np.linspace(-2,10,1000) - non = np.linspace(-10,2,1000) + tar = np.linspace(-2, 10, 1000) + non = np.linspace(-10, 2, 1000) - p=0.5 + p = 0.5 dcf, _, _ = compute_act_dcf(tar, non, p) - assert dcf == 2*167/1000 + assert dcf == 2 * 167 / 1000 p = [0.1, 0.5, 0.9] dcf, _, _ = compute_act_dcf(tar, non, p) print(dcf) - assert dcf[1] == 2*167/1000 + assert dcf[1] == 2 * 167 / 1000 def test_fast_eval(): - tar = np.linspace(-2,10,1000) - non = np.linspace(-10,2,1000) + tar = np.linspace(-2, 10, 1000) + non = np.linspace(-10, 2, 1000) - p=0.5 + p = 0.5 min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(tar, non, p) assert min_dcf > 0.332 and min_dcf < 0.334 - assert act_dcf == 2*167/1000 + assert act_dcf == 2 * 167 / 1000 assert eer > 0.166 and eer < 0.167 p = [0.1, 0.5, 0.9] min_dcf, act_dcf, eer, _ = fast_eval_dcf_eer(tar, non, p) - + assert min_dcf[1] > 0.332 and min_dcf[1] < 0.334 - assert act_dcf[1] == 2*167/1000 + assert act_dcf[1] == 2 * 167 / 1000 assert eer > 0.166 and eer < 0.167 diff --git a/tests/hyperion/metrics/test_eer.py b/tests/hyperion/metrics/test_eer.py index f31d00bf..64ec3416 100644 --- a/tests/hyperion/metrics/test_eer.py +++ b/tests/hyperion/metrics/test_eer.py @@ -13,8 +13,8 @@ def test_eer(): - tar = np.linspace(-2,10,1000) - non = np.linspace(-10,2,1000) + tar = np.linspace(-2, 10, 1000) + non = np.linspace(-10, 2, 1000) eer = compute_eer(tar, non) assert eer > 0.166 and eer < 0.167 @@ -22,10 +22,8 @@ def test_eer(): def test_prbep(): - tar = np.linspace(-2,10,1200) - non = np.linspace(-10,2,1200) + tar = np.linspace(-2, 10, 1200) + non = np.linspace(-10, 2, 1200) p = compute_prbep(tar, non) assert p == 200 - - diff --git a/tests/hyperion/metrics/test_roc.py b/tests/hyperion/metrics/test_roc.py index ac2f9d41..6bcb9e65 100644 --- a/tests/hyperion/metrics/test_roc.py +++ b/tests/hyperion/metrics/test_roc.py @@ -9,135 +9,137 @@ from numpy.testing import 
assert_allclose import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.metrics.roc import * -output_dir = './tests/data_out/metrics' +output_dir = "./tests/data_out/metrics" if not os.path.exists(output_dir): os.makedirs(output_dir) + def test_roc(): plt.figure() - plt.subplot(2,3,1) + plt.subplot(2, 3, 1) tar = np.array([1]) non = np.array([0]) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - h1, = plt.plot(pfa,pmiss,'r-^', label='ROCCH',linewidth=2) - h2, = plt.plot(pf,pm,'g--v', label='ROC',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + (h1,) = plt.plot(pfa, pmiss, "r-^", label="ROCCH", linewidth=2) + (h2,) = plt.plot(pf, pm, "g--v", label="ROC", linewidth=2) + plt.axis("square") plt.grid(True) plt.legend(handles=[h1, h2]) - plt.title('2 scores: non < tar') + plt.title("2 scores: non < tar") print(pmiss, pfa) print(pm, pf) - assert_allclose(pmiss,[ 0.,0.,1.]) - assert_allclose(pfa,[1.,0.,0.]) - assert_allclose(pm,[0.,0.,1.]) - assert_allclose(pf,[1.,0.,0.]) + assert_allclose(pmiss, [0.0, 0.0, 1.0]) + assert_allclose(pfa, [1.0, 0.0, 0.0]) + assert_allclose(pm, [0.0, 0.0, 1.0]) + assert_allclose(pf, [1.0, 0.0, 0.0]) - - plt.subplot(2,3,2) + plt.subplot(2, 3, 2) tar = np.array([0]) non = np.array([1]) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g--v',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g--v", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('2 scores: tar < non') + plt.title("2 scores: tar < non") print(pmiss, pfa) print(pm, pf) - assert_allclose(pmiss,[0.,1.]) - assert_allclose(pfa,[1.,0.]) - assert_allclose(pm,[0.,1.,1.]) - assert_allclose(pf,[1.,1.,0.]) + assert_allclose(pmiss, [0.0, 1.0]) + assert_allclose(pfa, [1.0, 0.0]) + assert_allclose(pm, [0.0, 1.0, 1.0]) + assert_allclose(pf, [1.0, 1.0, 0.0]) - - plt.subplot(2,3,3) + plt.subplot(2, 3, 3) tar = np.array([0]) - non = np.array([-1,1]) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g--v',linewidth=2) - plt.axis('square') + non = np.array([-1, 1]) + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g--v", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('3 scores: non < tar < non') + plt.title("3 scores: non < tar < non") print(pmiss, pfa) print(pm, pf) - assert_allclose(pmiss,[0.,0.,1.]) - assert_allclose(pfa,[1.,0.5,0.]) - assert_allclose(pm,[0.,0.,1.,1.]) - assert_allclose(pf,[1.,0.5,0.5,0.]) - - - plt.subplot(2,3,4) - tar = np.array([-1,1]) + assert_allclose(pmiss, [0.0, 0.0, 1.0]) + assert_allclose(pfa, [1.0, 0.5, 0.0]) + assert_allclose(pm, [0.0, 0.0, 1.0, 1.0]) + assert_allclose(pf, [1.0, 0.5, 0.5, 0.0]) + + plt.subplot(2, 3, 4) + tar = np.array([-1, 1]) non = np.array([0]) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g--v',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g--v", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('3 scores: tar < non < tar') - plt.xlabel(r'$P_{fa}$') - plt.ylabel(r'$P_{miss}$') + plt.title("3 scores: tar < non < tar") + plt.xlabel(r"$P_{fa}$") + plt.ylabel(r"$P_{miss}$") print(pmiss, pfa) 
print(pm, pf) - assert_allclose(pmiss, [0.,0.5,1.]) - assert_allclose(pfa, [1.,0.,0.]) - assert_allclose(pm, [0.,0.5,0.5,1.]) - assert_allclose(pf, [1.,1.,0.,0.]) + assert_allclose(pmiss, [0.0, 0.5, 1.0]) + assert_allclose(pfa, [1.0, 0.0, 0.0]) + assert_allclose(pm, [0.0, 0.5, 0.5, 1.0]) + assert_allclose(pf, [1.0, 1.0, 0.0, 0.0]) - - plt.subplot(2,3,5) + plt.subplot(2, 3, 5) rng = np.random.RandomState(100) - tar = rng.randn(100)+1 + tar = rng.randn(100) + 1 non = rng.randn(100) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('DET') + plt.title("DET") print(pmiss, pfa) print(pm[:10], pf[:10]) - assert_allclose(pmiss, [0.,0.,0.01,0.16,0.22,0.29,0.45,0.5,0.89,0.92,1.]) - assert_allclose(pfa, [1.,0.91,0.77,0.48,0.4,0.33,0.19,0.15,0.01,0.,0.]) - assert_allclose(pm[:10], [0.,0.,0.,0.,0.,0.,0.,0.,0.,0.]) - assert_allclose(pf[:10], [1.,0.99,0.98,0.97,0.96,0.95,0.94,0.93,0.92,0.91]) - - - plt.subplot(2,3,6) - tar = rng.randn(100)*2+1 + assert_allclose( + pmiss, [0.0, 0.0, 0.01, 0.16, 0.22, 0.29, 0.45, 0.5, 0.89, 0.92, 1.0] + ) + assert_allclose(pfa, [1.0, 0.91, 0.77, 0.48, 0.4, 0.33, 0.19, 0.15, 0.01, 0.0, 0.0]) + assert_allclose(pm[:10], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]) + assert_allclose( + pf[:10], [1.0, 0.99, 0.98, 0.97, 0.96, 0.95, 0.94, 0.93, 0.92, 0.91] + ) + + plt.subplot(2, 3, 6) + tar = rng.randn(100) * 2 + 1 non = rng.randn(100) - pmiss, pfa = compute_rocch(tar,non) - pm, pf = compute_roc(tar,non) - plt.plot(pfa,pmiss,'r-^',pf,pm,'g',linewidth=2) - plt.axis('square') + pmiss, pfa = compute_rocch(tar, non) + pm, pf = compute_roc(tar, non) + plt.plot(pfa, pmiss, "r-^", pf, pm, "g", linewidth=2) + plt.axis("square") plt.grid(True) - plt.title('flatter DET') + plt.title("flatter DET") print(pmiss, pfa) print(pm[:10], pf[:10]) - assert_allclose(pmiss,[0.,0.31,0.48,0.5,0.58,0.62,0.85,1.]) - assert_allclose(pfa,[1.,0.41,0.15,0.12,0.06,0.05,0.,0.]) - assert_allclose(pm[:10], [0.,0.01,0.02,0.03,0.04,0.05,0.05,0.05,0.05,0.05]) - assert_allclose(pf[:10], [1.,1.,1.,1.,1.,1.,0.99,0.98,0.97, 0.96]) + assert_allclose(pmiss, [0.0, 0.31, 0.48, 0.5, 0.58, 0.62, 0.85, 1.0]) + assert_allclose(pfa, [1.0, 0.41, 0.15, 0.12, 0.06, 0.05, 0.0, 0.0]) + assert_allclose( + pm[:10], [0.0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.05, 0.05, 0.05, 0.05] + ) + assert_allclose(pf[:10], [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.99, 0.98, 0.97, 0.96]) # plt.show() - plt.savefig(output_dir + '/roc.pdf') + plt.savefig(output_dir + "/roc.pdf") plt.close() - def test_rocch2eer(): rng = np.random.RandomState(100) - tar = rng.randn(100)+1 + tar = rng.randn(100) + 1 non = rng.randn(100) - pmiss, pfa = compute_rocch(tar,non) + pmiss, pfa = compute_rocch(tar, non) eer = rocch2eer(pmiss, pfa) diff --git a/tests/hyperion/pdfs/core/test_normal.py b/tests/hyperion/pdfs/core/test_normal.py index eadc7959..52b6a25c 100644 --- a/tests/hyperion/pdfs/core/test_normal.py +++ b/tests/hyperion/pdfs/core/test_normal.py @@ -8,7 +8,8 @@ import numpy as np from scipy import linalg as la import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from numpy.testing import assert_allclose @@ -16,7 +17,7 @@ from hyperion.utils.math import symmat2vec from hyperion.pdfs import NormalDiagCov, Normal -output_dir = './tests/data_out/pdfs/core/normal' 
+output_dir = "./tests/data_out/pdfs/core/normal" if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -24,7 +25,7 @@ num_samples = 1000 batch_size = 250 num_samples_train = 1000000 -model_file = output_dir + '/model.h5' +model_file = output_dir + "/model.h5" def create_diag_pdf(): @@ -38,7 +39,6 @@ def create_diag_pdf(): return model, model_diag - def create_pdf(): rng = np.random.RandomState(seed=0) @@ -48,9 +48,8 @@ def create_pdf(): Lambda = np.dot(U, U.T) model = Normal(mu=mu, Lambda=Lambda, x_dim=x_dim) return model - - - + + def test_diag_properties(): model, model_diag = create_diag_pdf() @@ -59,18 +58,18 @@ def test_diag_properties(): assert_allclose(model.logLambda, np.sum(np.log(model_diag.Lambda))) - def test_properties(): model = create_pdf() assert_allclose(model.Sigma, la.inv(model.Lambda)) assert_allclose(model.cholLambda, la.cholesky(model.Lambda, lower=True)) - assert_allclose(model.logLambda, 2*np.sum(np.log(np.diag(la.cholesky(model.Lambda))))) - + assert_allclose( + model.logLambda, 2 * np.sum(np.log(np.diag(la.cholesky(model.Lambda)))) + ) + - def test_diag_initialize(): - + model1, model1_diag = create_diag_pdf() model1.initialize() model1_diag.initialize() @@ -90,7 +89,6 @@ def test_diag_initialize(): assert_allclose(model2.Lambda, np.diag(model1_diag.Lambda)) - def test_initialize(): model1 = create_pdf() @@ -99,8 +97,7 @@ def test_initialize(): model2 = Normal(eta=model1.eta, x_dim=model1.x_dim) model2.initialize() - model3 = Normal(mu=model2.mu, Lambda=model2.Lambda, - x_dim=model1.x_dim) + model3 = Normal(mu=model2.mu, Lambda=model2.Lambda, x_dim=model1.x_dim) model3.initialize() assert_allclose(model1.eta, model2.eta) @@ -116,23 +113,21 @@ def test_initialize(): assert_allclose(model1.Lambda, model3.Lambda) - def test_log_h(): model1 = create_pdf() - sample_weight = np.arange(1,num_samples+1, dtype=float)/num_samples - - assert(model1.log_h(None) == 0) - assert(model1.accum_log_h(None, sample_weight=sample_weight) == 0) + sample_weight = np.arange(1, num_samples + 1, dtype=float) / num_samples + + assert model1.log_h(None) == 0 + assert model1.accum_log_h(None, sample_weight=sample_weight) == 0 - def test_suff_stats(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) + sample_weight = 0.5 * np.ones((num_samples,)) xx = [] for i in range(x.shape[0]): @@ -145,10 +140,15 @@ def test_suff_stats(): N2, u_x2 = model1.accum_suff_stats(x, batch_size=batch_size) assert_allclose(model1.accum_suff_stats(x, batch_size=batch_size)[1], u_x) - assert_allclose(model1.accum_suff_stats(x, sample_weight=sample_weight)[1], 0.5*u_x) - assert_allclose(model1.accum_suff_stats(x, sample_weight=sample_weight, - batch_size=batch_size)[1], 0.5*u_x) - + assert_allclose( + model1.accum_suff_stats(x, sample_weight=sample_weight)[1], 0.5 * u_x + ) + assert_allclose( + model1.accum_suff_stats(x, sample_weight=sample_weight, batch_size=batch_size)[ + 1 + ], + 0.5 * u_x, + ) def test_diag_log_prob(): @@ -156,12 +156,13 @@ def test_diag_log_prob(): model1, model1_diag = create_diag_pdf() x = model1.sample(num_samples) - - assert_allclose(model1.log_prob(x, method='nat'), - model1_diag.log_prob(x, method='std')) - assert_allclose(model1.log_prob(x, method='std'), - model1_diag.log_prob(x, method='std')) + assert_allclose( + model1.log_prob(x, method="nat"), model1_diag.log_prob(x, method="std") + ) + assert_allclose( + model1.log_prob(x, method="std"), model1_diag.log_prob(x, method="std") + ) def test_log_prob(): @@ -169,14 +170,13 @@ 
def test_log_prob(): model1 = create_pdf() x = model1.sample(num_samples) - - assert_allclose(model1.log_prob(x, method='nat'), - model1.log_prob(x, method='std')) + + assert_allclose(model1.log_prob(x, method="nat"), model1.log_prob(x, method="std")) u_x = model1.compute_suff_stats(x) - assert_allclose(model1.log_prob(x, u_x, method='nat'), - model1.log_prob(x, method='std')) - + assert_allclose( + model1.log_prob(x, u_x, method="nat"), model1.log_prob(x, method="std") + ) def test_diag_elbo(): @@ -184,28 +184,28 @@ def test_diag_elbo(): model1, model1_diag = create_diag_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + assert_allclose(model1.elbo(x), model1_diag.elbo(x)) - assert_allclose(model1.elbo(x, sample_weight=sample_weight), - 0.5*model1_diag.elbo(x)) + assert_allclose( + model1.elbo(x, sample_weight=sample_weight), 0.5 * model1_diag.elbo(x) + ) + - - def test_elbo(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - - assert_allclose(model1.elbo(x), - np.sum(model1.log_prob(x, method='std'))) - assert_allclose(model1.elbo(x, sample_weight=sample_weight), - 0.5*np.sum(model1.log_prob(x, method='std'))) - - - + sample_weight = 0.5 * np.ones((num_samples,)) + + assert_allclose(model1.elbo(x), np.sum(model1.log_prob(x, method="std"))) + assert_allclose( + model1.elbo(x, sample_weight=sample_weight), + 0.5 * np.sum(model1.log_prob(x, method="std")), + ) + + # def test_eval_logcdf(): # model1 = create_pdf() @@ -214,8 +214,7 @@ def test_elbo(): # assert(model1.eval_logcdf(model1.mu) == x_dim*np.log(0.5)) # assert(model1.eval_logcdf(1e10*np.ones((x_dim,))) > np.log(0.99)) # assert(model1.eval_logcdf(-1e10*np.ones((x_dim,))) < np.log(0.01)) - - + def test_diag_fit(): @@ -229,7 +228,7 @@ def test_diag_fit(): model2_diag = NormalDiagCov(x_dim=x_dim) elbo_diag = model2_diag.fit(x, x_val=x_val) - + assert_allclose(model2.mu, model2_diag.mu, atol=0.01) assert_allclose(np.diag(model2.Lambda), model2_diag.Lambda, atol=0.01) assert_allclose(model2.A, model2_diag.A, atol=0.02) @@ -237,7 +236,6 @@ def test_diag_fit(): assert_allclose(elbo[3], elbo_diag[3], rtol=1e-4) - def test_fit(): model1 = create_pdf() @@ -249,8 +247,10 @@ def test_fit(): elbo = model2.fit(x, x_val=x_val) assert_allclose(model2.mu, np.mean(x, axis=0)) - assert_allclose(model2.Lambda, la.inv(np.dot(x.T, x)/num_samples_train - -np.outer(model2.mu, model2.mu))) + assert_allclose( + model2.Lambda, + la.inv(np.dot(x.T, x) / num_samples_train - np.outer(model2.mu, model2.mu)), + ) assert_allclose(model1.mu, model2.mu, atol=0.02) assert_allclose(model1.Lambda, model2.Lambda, atol=0.2) assert_allclose(model1.eta, model2.eta, atol=0.05) @@ -261,30 +261,26 @@ def test_fit(): assert_allclose(elbo[3], np.mean(model2.log_prob(x_val)), rtol=1e-4) - def test_plot(): - + model1 = create_pdf() model1.plot1D() - plt.savefig(output_dir + '/normal_1D.pdf') + plt.savefig(output_dir + "/normal_1D.pdf") plt.close() model1.plot2D() - plt.savefig(output_dir + '/normal_2D.pdf') + plt.savefig(output_dir + "/normal_2D.pdf") plt.close() model1.plot3D() - plt.savefig(output_dir + '/normal_3D.pdf') + plt.savefig(output_dir + "/normal_3D.pdf") plt.close() model1.plot3D_ellipsoid() - plt.savefig(output_dir + '/normal_3De.pdf') + plt.savefig(output_dir + "/normal_3De.pdf") plt.close() - -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) - - diff --git 
a/tests/hyperion/pdfs/core/test_normal_diag_cov.py b/tests/hyperion/pdfs/core/test_normal_diag_cov.py index 4df9c822..e38c3eb1 100644 --- a/tests/hyperion/pdfs/core/test_normal_diag_cov.py +++ b/tests/hyperion/pdfs/core/test_normal_diag_cov.py @@ -7,13 +7,14 @@ import os import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.pdfs import NormalDiagCov from numpy.testing import assert_allclose -output_dir = './tests/data_out/pdfs/core/normal_diag_cov' +output_dir = "./tests/data_out/pdfs/core/normal_diag_cov" if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -21,7 +22,7 @@ num_samples = 1000 batch_size = 250 num_samples_train = 100000 -model_file = output_dir + '/model.h5' +model_file = output_dir + "/model.h5" def create_pdf(): @@ -33,16 +34,15 @@ def create_pdf(): model = NormalDiagCov(mu=mu, Lambda=Lambda, x_dim=x_dim) return model - + def test_properties(): model = create_pdf() - assert(np.all(model.Sigma == 1/model.Lambda)) - assert(np.all(model.cholLambda == np.sqrt(model.Lambda))) - assert(np.all(model.logLambda == np.sum(np.log(model.Lambda)))) + assert np.all(model.Sigma == 1 / model.Lambda) + assert np.all(model.cholLambda == np.sqrt(model.Lambda)) + assert np.all(model.logLambda == np.sum(np.log(model.Lambda))) - def test_initialize(): model1 = create_pdf() @@ -51,9 +51,7 @@ def test_initialize(): model2 = NormalDiagCov(eta=model1.eta, x_dim=model1.x_dim) model2.initialize() - model3 = NormalDiagCov(mu=model2.mu, - Lambda=model2.Lambda, - x_dim=model1.x_dim) + model3 = NormalDiagCov(mu=model2.mu, Lambda=model2.Lambda, x_dim=model1.x_dim) model3.initialize() print(model3.eta) @@ -62,7 +60,7 @@ def test_initialize(): print(model1.A) print(model2.A) print(model3.A) - + assert_allclose(model1.eta, model2.eta) assert_allclose(model1.eta, model3.eta) @@ -79,10 +77,10 @@ def test_initialize(): def test_log_h(): model1 = create_pdf() - sample_weight = np.arange(1,num_samples+1, dtype=float)/num_samples - - assert(model1.log_h(None) == 0) - assert(model1.accum_log_h(None, sample_weight=sample_weight) == 0) + sample_weight = np.arange(1, num_samples + 1, dtype=float) / num_samples + + assert model1.log_h(None) == 0 + assert model1.accum_log_h(None, sample_weight=sample_weight) == 0 def test_suff_stats(): @@ -90,18 +88,24 @@ def test_suff_stats(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - - u_x = np.hstack((x, x*x)) + sample_weight = 0.5 * np.ones((num_samples,)) + + u_x = np.hstack((x, x * x)) assert_allclose(model1.compute_suff_stats(x), u_x) N, u_x = model1.accum_suff_stats(x) N2, u_x2 = model1.accum_suff_stats(x, batch_size=batch_size) assert_allclose(model1.accum_suff_stats(x, batch_size=batch_size)[1], u_x) - assert_allclose(model1.accum_suff_stats(x, sample_weight=sample_weight)[1], 0.5*u_x) - assert_allclose(model1.accum_suff_stats(x, sample_weight=sample_weight, batch_size=batch_size)[1], 0.5*u_x) - + assert_allclose( + model1.accum_suff_stats(x, sample_weight=sample_weight)[1], 0.5 * u_x + ) + assert_allclose( + model1.accum_suff_stats(x, sample_weight=sample_weight, batch_size=batch_size)[ + 1 + ], + 0.5 * u_x, + ) def test_log_prob(): @@ -109,37 +113,37 @@ def test_log_prob(): model1 = create_pdf() x = model1.sample(num_samples) - - assert_allclose(model1.log_prob(x, method='nat'), - model1.log_prob(x, method='std')) + + assert_allclose(model1.log_prob(x, method="nat"), model1.log_prob(x, method="std")) u_x = 
model1.compute_suff_stats(x) - assert_allclose(model1.log_prob(x, u_x, method='nat'), - model1.log_prob(x, method='std')) - + assert_allclose( + model1.log_prob(x, u_x, method="nat"), model1.log_prob(x, method="std") + ) + def test_elbo(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - - assert_allclose(model1.elbo(x), - np.sum(model1.log_prob(x, method='std'))) - assert_allclose(model1.elbo(x, sample_weight=sample_weight), - 0.5*np.sum(model1.log_prob(x, method='std'))) - + sample_weight = 0.5 * np.ones((num_samples,)) + + assert_allclose(model1.elbo(x), np.sum(model1.log_prob(x, method="std"))) + assert_allclose( + model1.elbo(x, sample_weight=sample_weight), + 0.5 * np.sum(model1.log_prob(x, method="std")), + ) + def test_log_cdf(): model1 = create_pdf() - assert_allclose(model1.log_cdf(model1.mu), x_dim*np.log(0.5)) - assert model1.log_cdf(1e10*np.ones((x_dim,))) > np.log(0.99) - assert model1.log_cdf(-1e10*np.ones((x_dim,))) < np.log(0.01) - - + assert_allclose(model1.log_cdf(model1.mu), x_dim * np.log(0.5)) + assert model1.log_cdf(1e10 * np.ones((x_dim,))) > np.log(0.99) + assert model1.log_cdf(-1e10 * np.ones((x_dim,))) < np.log(0.01) + def test_fit(): @@ -152,7 +156,7 @@ def test_fit(): elbo = model2.fit(x, x_val=x_val) assert_allclose(model2.mu, np.mean(x, axis=0)) - assert_allclose(model2.Lambda, 1/np.std(x, axis=0)**2) + assert_allclose(model2.Lambda, 1 / np.std(x, axis=0) ** 2) assert_allclose(model1.mu, model2.mu, atol=0.01) assert_allclose(model1.Lambda, model2.Lambda, atol=0.01) assert_allclose(model1.eta, model2.eta, atol=0.01) @@ -161,31 +165,28 @@ def test_fit(): assert_allclose(elbo[3], np.mean(model1.log_prob(x_val)), rtol=1e-4) assert_allclose(elbo[1], np.mean(model2.log_prob(x)), rtol=1e-5) assert_allclose(elbo[3], np.mean(model2.log_prob(x_val)), rtol=1e-4) - + def test_plot(): - + model1 = create_pdf() model1.plot1D() - plt.savefig(output_dir + '/normal_1D.pdf') + plt.savefig(output_dir + "/normal_1D.pdf") plt.close() model1.plot2D() - plt.savefig(output_dir + '/normal_2D.pdf') + plt.savefig(output_dir + "/normal_2D.pdf") plt.close() model1.plot3D() - plt.savefig(output_dir + '/normal_3D.pdf') + plt.savefig(output_dir + "/normal_3D.pdf") plt.close() model1.plot3D_ellipsoid() - plt.savefig(output_dir + '/normal_3De.pdf') + plt.savefig(output_dir + "/normal_3De.pdf") plt.close() - -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) - - diff --git a/tests/hyperion/pdfs/mixtures/test_gmm.py b/tests/hyperion/pdfs/mixtures/test_gmm.py index 34b42c9e..3f133167 100644 --- a/tests/hyperion/pdfs/mixtures/test_gmm.py +++ b/tests/hyperion/pdfs/mixtures/test_gmm.py @@ -7,7 +7,8 @@ import os import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from numpy.testing import assert_allclose @@ -16,57 +17,51 @@ from hyperion.utils.math import symmat2vec from hyperion.pdfs import GMMDiagCov, GMM -output_dir = './tests/data_out/pdfs/core/mixtures/gmm' +output_dir = "./tests/data_out/pdfs/core/mixtures/gmm" if not os.path.exists(output_dir): os.makedirs(output_dir) x_dim = 3 pi1 = np.array([0.5, 0.25, 0.125, 0.125]) -mu1 = np.array([[-2, 1.5, -1], - [1, 1, 0], - [0, -1, 1], - [1.5, -1.5, 0.5]]) -S1 = np.square(np.array([[1, 0.75, 0.5], - [0.5, 0.3, 0.1], - [0.5, 0.6, 0.7], - [0.5, 0.4, 0.3]])) - -S0fc = np.array([[1, 0.3, 0.1], - [0.3, 1, -0.25], - [0.1, -0.25, 1]]) +mu1 = np.array([[-2, 1.5, -1], [1, 1, 0], [0, -1, 1], [1.5, -1.5, 0.5]]) +S1 = 
np.square( + np.array([[1, 0.75, 0.5], [0.5, 0.3, 0.1], [0.5, 0.6, 0.7], [0.5, 0.4, 0.3]]) +) + +S0fc = np.array([[1, 0.3, 0.1], [0.3, 1, -0.25], [0.1, -0.25, 1]]) S1fc = np.zeros((len(pi1), x_dim, x_dim)) L1fc = np.zeros((len(pi1), x_dim, x_dim)) L1dc = np.zeros((len(pi1), x_dim, x_dim)) for k in range(len(pi1)): - SS = S1[k]*S0fc - S1fc[k] = (SS+SS.T)/2 + SS = S1[k] * S0fc + S1fc[k] = (SS + SS.T) / 2 L1fc[k] = la.inv(S1fc[k]) - L1dc[k] = np.diag(1/S1[k]) - + L1dc[k] = np.diag(1 / S1[k]) + num_samples = 1000 batch_size = 250 num_samples_init = 100 num_samples_train = 10000 -model_file = output_dir + '/model.h5' +model_file = output_dir + "/model.h5" def create_diag_pdf(): - model_diag = GMMDiagCov(num_comp=len(pi1), pi=pi1, mu=mu1, Lambda=1/S1, x_dim=x_dim) + model_diag = GMMDiagCov( + num_comp=len(pi1), pi=pi1, mu=mu1, Lambda=1 / S1, x_dim=x_dim + ) model = GMM(num_comp=len(pi1), pi=pi1, mu=mu1, Lambda=L1dc, x_dim=x_dim) return model, model_diag - def create_pdf(): - + model = GMM(num_comp=len(pi1), pi=pi1, mu=mu1, Lambda=L1fc, x_dim=x_dim) return model - def test_diag_properties(): model, model_diag = create_diag_pdf() @@ -75,8 +70,7 @@ def test_diag_properties(): for k in range(model.num_comp): assert_allclose(model.Sigma[k], np.diag(model_diag.Sigma[k])) assert_allclose(model.cholLambda[k], np.diag(np.sqrt(model_diag.Lambda[k]))) - - + def test_properties(): @@ -86,109 +80,107 @@ def test_properties(): for k in range(model.num_comp): assert_allclose(model.Sigma[k], la.inv(model.Lambda[k])) assert_allclose(model.cholLambda[k], la.cholesky(model.Lambda[k], lower=True)) - assert_allclose(model.logLambda[k], - 2*np.sum(np.log(np.diag(la.cholesky(model.Lambda[k]))))) - + assert_allclose( + model.logLambda[k], + 2 * np.sum(np.log(np.diag(la.cholesky(model.Lambda[k])))), + ) def test_diag_initialize(): - model1, model1_diag = create_diag_pdf() - model1.initialize() - model1_diag.initialize() - - model2 = GMM(num_comp=model1.num_comp, - pi=model1.pi, - eta=model1.eta, x_dim=model1.x_dim) - model2.initialize() - - assert_allclose(model1.compute_A_std(model1.mu, model1.Lambda), model1_diag.A) - assert_allclose(model1.compute_A_nat(model1.eta), model1_diag.A) - assert_allclose(model1.A, model1_diag.A) - assert_allclose(model2.A, model1_diag.A) - - assert_allclose(model1.mu, model1_diag.mu) - assert_allclose(model2.mu, model1_diag.mu) - - for k in range(model1.num_comp): - assert_allclose(model1.Lambda[k], np.diag(model1_diag.Lambda[k])) - assert_allclose(model2.Lambda[k], np.diag(model1_diag.Lambda[k])) - - - + model1, model1_diag = create_diag_pdf() + model1.initialize() + model1_diag.initialize() + + model2 = GMM( + num_comp=model1.num_comp, pi=model1.pi, eta=model1.eta, x_dim=model1.x_dim + ) + model2.initialize() + + assert_allclose(model1.compute_A_std(model1.mu, model1.Lambda), model1_diag.A) + assert_allclose(model1.compute_A_nat(model1.eta), model1_diag.A) + assert_allclose(model1.A, model1_diag.A) + assert_allclose(model2.A, model1_diag.A) + + assert_allclose(model1.mu, model1_diag.mu) + assert_allclose(model2.mu, model1_diag.mu) + + for k in range(model1.num_comp): + assert_allclose(model1.Lambda[k], np.diag(model1_diag.Lambda[k])) + assert_allclose(model2.Lambda[k], np.diag(model1_diag.Lambda[k])) + + def test_initialize(): - model1 = create_pdf() - model1.initialize() + model1 = create_pdf() + model1.initialize() - model2 = GMM(num_comp=model1.num_comp, - pi=model1.pi, - eta=model1.eta, x_dim=model1.x_dim) - assert_allclose(model1.eta, model2.eta) - model2.initialize() + model2 = 
GMM( + num_comp=model1.num_comp, pi=model1.pi, eta=model1.eta, x_dim=model1.x_dim + ) + assert_allclose(model1.eta, model2.eta) + model2.initialize() - model3 = GMM(num_comp=model2.num_comp, - pi=model2.pi, - mu=model2.mu, - Lambda=model2.Lambda, - x_dim=model1.x_dim) - assert_allclose(model1.eta, model3.eta, atol=1e-5) - model3.initialize() + model3 = GMM( + num_comp=model2.num_comp, + pi=model2.pi, + mu=model2.mu, + Lambda=model2.Lambda, + x_dim=model1.x_dim, + ) + assert_allclose(model1.eta, model3.eta, atol=1e-5) + model3.initialize() - assert_allclose(model1.eta, model2.eta, atol=1e-5) - assert_allclose(model1.eta, model3.eta, atol=1e-5) + assert_allclose(model1.eta, model2.eta, atol=1e-5) + assert_allclose(model1.eta, model3.eta, atol=1e-5) - assert_allclose(model1.A, model2.A) - assert_allclose(model1.A, model3.A) + assert_allclose(model1.A, model2.A) + assert_allclose(model1.A, model3.A) - assert_allclose(model1.mu, model2.mu, atol=1e-10) - assert_allclose(model1.mu, model3.mu, atol=1e-10) + assert_allclose(model1.mu, model2.mu, atol=1e-10) + assert_allclose(model1.mu, model3.mu, atol=1e-10) - assert_allclose(model1.Lambda, model2.Lambda) - assert_allclose(model1.Lambda, model3.Lambda) + assert_allclose(model1.Lambda, model2.Lambda) + assert_allclose(model1.Lambda, model3.Lambda) - def test_initialize_stdnormal(): model = GMM(num_comp=1, x_dim=x_dim) model.initialize() - - assert(model.pi==1) + + assert model.pi == 1 assert_allclose(model.mu, np.zeros((1, x_dim))) assert_allclose(model.Lambda[0], np.eye(x_dim)) - def test_initialize_kmeans(): model1 = create_pdf() x = model1.sample(num_samples=num_samples_init) - + model2 = GMM(num_comp=4, x_dim=x_dim) model2.initialize(x) print(model1.mu) - print(model2.mu[[2,3,1,0]]) + print(model2.mu[[2, 3, 1, 0]]) - def test_log_h(): - model1 = create_pdf() + model1 = create_pdf() + + sample_weight = np.arange(1, num_samples + 1, dtype=float) / num_samples - sample_weight = np.arange(1,num_samples+1, dtype=float)/num_samples - - assert(model1.log_h(None) == 0) - assert(model1.accum_log_h(None, sample_weight=sample_weight) == 0) + assert model1.log_h(None) == 0 + assert model1.accum_log_h(None, sample_weight=sample_weight) == 0 - def test_suff_stats(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) + sample_weight = 0.5 * np.ones((num_samples,)) xx = [] for i in range(x.shape[0]): @@ -199,56 +191,58 @@ def test_suff_stats(): N, u_x = model1.accum_suff_stats(x) - N1, u_x1 = model1.accum_suff_stats( - x, batch_size=batch_size) + N1, u_x1 = model1.accum_suff_stats(x, batch_size=batch_size) assert_allclose(N1, N) assert_allclose(u_x1, u_x) - N1, u_x1 = model1.accum_suff_stats( - x, sample_weight=sample_weight) - assert_allclose(N1, 0.5*N) - assert_allclose(u_x1, 0.5*u_x) + N1, u_x1 = model1.accum_suff_stats(x, sample_weight=sample_weight) + assert_allclose(N1, 0.5 * N) + assert_allclose(u_x1, 0.5 * u_x) N1, u_x1 = model1.accum_suff_stats( - x, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N1, 0.5*N) - assert_allclose(u_x1, 0.5*u_x) - + x, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N1, 0.5 * N) + assert_allclose(u_x1, 0.5 * u_x) + - def test_suff_stats_segments(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + N, u_x = model1.accum_suff_stats(x) - segments = np.array([[0, num_samples/2-1], - [num_samples/2, num_samples-1], - [0, 
num_samples/4-1], - [num_samples/4, num_samples/2-1], - [num_samples/2, 3*num_samples/4-1], - [3*num_samples/4, num_samples-1]]) + segments = np.array( + [ + [0, num_samples / 2 - 1], + [num_samples / 2, num_samples - 1], + [0, num_samples / 4 - 1], + [num_samples / 4, num_samples / 2 - 1], + [num_samples / 2, 3 * num_samples / 4 - 1], + [3 * num_samples / 4, num_samples - 1], + ] + ) print(N.shape) print(u_x.shape) - N1, u_x1 = model1.accum_suff_stats_segments( - x, segments, batch_size=batch_size) - assert_allclose(np.sum(N1, axis=0), 2*N) - assert_allclose(np.sum(u_x1, axis=0), 2*u_x) + N1, u_x1 = model1.accum_suff_stats_segments(x, segments, batch_size=batch_size) + assert_allclose(np.sum(N1, axis=0), 2 * N) + assert_allclose(np.sum(u_x1, axis=0), 2 * u_x) N2, u_x2 = model1.accum_suff_stats_segments( - x, segments, sample_weight=sample_weight) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) + x, segments, sample_weight=sample_weight + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) N2, u_x2 = model1.accum_suff_stats_segments( - x, segments, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) - + x, segments, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) def test_suff_stats_segments_prob(): @@ -256,32 +250,31 @@ def test_suff_stats_segments_prob(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + N, u_x = model1.accum_suff_stats(x) prob = np.zeros((num_samples, 4)) - prob[:int(num_samples/2), 0]=1 - prob[int(num_samples/2):int(3*num_samples/4), 1]=1 - prob[int(3*num_samples/4):int(4*num_samples/5), 2]=1 - prob[int(4*num_samples/5):, 3]=1 - - N1, u_x1 = model1.accum_suff_stats_segments_prob( - x, prob, batch_size=batch_size) + prob[: int(num_samples / 2), 0] = 1 + prob[int(num_samples / 2) : int(3 * num_samples / 4), 1] = 1 + prob[int(3 * num_samples / 4) : int(4 * num_samples / 5), 2] = 1 + prob[int(4 * num_samples / 5) :, 3] = 1 + + N1, u_x1 = model1.accum_suff_stats_segments_prob(x, prob, batch_size=batch_size) assert_allclose(np.sum(N1, axis=0), N) assert_allclose(np.sum(u_x1, axis=0), u_x) N2, u_x2 = model1.accum_suff_stats_segments_prob( - x, prob, sample_weight=sample_weight) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) + x, prob, sample_weight=sample_weight + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) N2, u_x2 = model1.accum_suff_stats_segments_prob( - x, prob, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) - - + x, prob, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) def test_suff_stats_sorttime(): @@ -289,28 +282,30 @@ def test_suff_stats_sorttime(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + N, u_x = model1.accum_suff_stats(x) - frame_length=int(num_samples/100) - frame_shift=frame_length - + frame_length = int(num_samples / 100) + frame_shift = frame_length + N1, u_x1 = model1.accum_suff_stats_sorttime( - x, frame_length, frame_shift, batch_size=batch_size) + x, frame_length, frame_shift, batch_size=batch_size + ) assert_allclose(np.sum(N1, axis=0), N) assert_allclose(np.sum(u_x1, axis=0), u_x) N2, u_x2 = 
model1.accum_suff_stats_sorttime( - x, frame_length, frame_shift, sample_weight=sample_weight) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) + x, frame_length, frame_shift, sample_weight=sample_weight + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) N2, u_x2 = model1.accum_suff_stats_sorttime( - x, frame_length, frame_shift, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) - + x, frame_length, frame_shift, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) def test_diag_log_prob(): @@ -318,28 +313,21 @@ def test_diag_log_prob(): model1, model1_diag = create_diag_pdf() x = model1.sample(num_samples) - - assert_allclose(model1.log_prob(x, mode='nat'), - model1_diag.log_prob(x, mode='std')) - assert_allclose(model1.log_prob(x, mode='std'), - model1_diag.log_prob(x, mode='std')) + + assert_allclose(model1.log_prob(x, mode="nat"), model1_diag.log_prob(x, mode="std")) + assert_allclose(model1.log_prob(x, mode="std"), model1_diag.log_prob(x, mode="std")) - def test_log_prob(): model1 = create_pdf() x = model1.sample(num_samples) - - assert_allclose(model1.log_prob(x, mode='nat'), - model1.log_prob(x, mode='std')) - u_x = model1.compute_suff_stats(x) - assert_allclose(model1.log_prob(x, u_x, mode='nat'), - model1.log_prob(x, mode='std')) - + assert_allclose(model1.log_prob(x, mode="nat"), model1.log_prob(x, mode="std")) + u_x = model1.compute_suff_stats(x) + assert_allclose(model1.log_prob(x, u_x, mode="nat"), model1.log_prob(x, mode="std")) def test_diag_elbo(): @@ -347,27 +335,27 @@ def test_diag_elbo(): model1, model1_diag = create_diag_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + assert_allclose(model1.elbo(x), model1_diag.elbo(x)) - assert_allclose(model1.elbo(x, sample_weight=sample_weight), - 0.5*model1_diag.elbo(x)) + assert_allclose( + model1.elbo(x, sample_weight=sample_weight), 0.5 * model1_diag.elbo(x) + ) - def test_elbo(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - - assert(model1.elbo(x)/num_samples + 0.4> np.mean(model1.log_prob(x, mode='std'))) - assert(model1.elbo(x, sample_weight=sample_weight)/num_samples + 0.2> - 0.5*np.sum(model1.log_prob(x, mode='std'))) - - - + sample_weight = 0.5 * np.ones((num_samples,)) + + assert model1.elbo(x) / num_samples + 0.4 > np.mean(model1.log_prob(x, mode="std")) + assert model1.elbo( + x, sample_weight=sample_weight + ) / num_samples + 0.2 > 0.5 * np.sum(model1.log_prob(x, mode="std")) + + def test_diag_fit_kmeans(): model1, _ = create_diag_pdf() @@ -384,26 +372,24 @@ def test_diag_fit_kmeans(): model2_diag.initialize(x) elbo_diag = model2_diag.fit(x, x_val=x_val) - - model2.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_kmeans_init_D01.pdf') + model2.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_kmeans_init_D01.pdf") plt.close() - model2.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_kmeans_init_D02.pdf') + model2.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_kmeans_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - plt.plot(elbo[3], 'r--') - 
plt.plot(elbo_diag[1], 'g') - plt.plot(elbo_diag[3], 'g--') - plt.savefig(output_dir + '/fit_kmeans_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], "r--") + plt.plot(elbo_diag[1], "g") + plt.plot(elbo_diag[3], "g--") + plt.savefig(output_dir + "/fit_kmeans_init_elbo.pdf") plt.close() - def test_fit_kmeans(): model1 = create_pdf() @@ -416,25 +402,22 @@ def test_fit_kmeans(): model2.initialize(x) elbo = model2.fit(x, x_val=x_val) - - model2.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_kmeans_init_D01.pdf') + model2.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_kmeans_init_D01.pdf") plt.close() - model2.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_kmeans_init_D02.pdf') + model2.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_kmeans_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - plt.plot(elbo[3], 'r--') - plt.savefig('./tests/data_out/gmm_fit_kmeans_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], "r--") + plt.savefig("./tests/data_out/gmm_fit_kmeans_init_elbo.pdf") plt.close() - - def test_fit_kmeans_split2(): model1 = create_pdf() @@ -451,24 +434,21 @@ def test_fit_kmeans_split2(): model2 = model2.split_comp(2) elbo = model2.fit(x, x_val=x_val) - - model2.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split2_init_D01.pdf') + model2.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split2_init_D01.pdf") plt.close() - model2.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split2_init_D02.pdf') + model2.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split2_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - plt.plot(elbo[3], 'r--') - plt.savefig('./tests/data_out/diag_gmm_fit_split2_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], "r--") + plt.savefig("./tests/data_out/diag_gmm_fit_split2_init_elbo.pdf") plt.close() - - def test_fit_kmeans_split4(): @@ -484,23 +464,22 @@ def test_fit_kmeans_split4(): model2 = model2.split_comp(4) elbo = model2.fit(x, x_val=x_val) - model1.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split4_init_D01.pdf') + model1.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split4_init_D01.pdf") plt.close() - model1.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split4_init_D02.pdf') + model1.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split4_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - 
plt.plot(elbo[3], 'r--') - plt.savefig(output_dir + '/fit_split4_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], "r--") + plt.savefig(output_dir + "/fit_split4_init_elbo.pdf") plt.close() - # def test_eval_logcdf(): # model1 = create_pdf() @@ -511,27 +490,25 @@ def test_fit_kmeans_split4(): def test_plot(): - + model1 = create_pdf() model1.plot1D() - plt.savefig(output_dir + '/plot_1D.pdf') + plt.savefig(output_dir + "/plot_1D.pdf") plt.close() - + model1.plot2D() - plt.savefig(output_dir + '/plot_2D.pdf') + plt.savefig(output_dir + "/plot_2D.pdf") plt.close() - + model1.plot3D() - plt.savefig(output_dir + '/plot_3D.pdf') + plt.savefig(output_dir + "/plot_3D.pdf") plt.close() - + model1.plot3D_ellipsoid() - plt.savefig(output_dir + '/plot_3De.pdf') + plt.savefig(output_dir + "/plot_3De.pdf") plt.close() -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) - - diff --git a/tests/hyperion/pdfs/mixtures/test_gmm_diag_cov.py b/tests/hyperion/pdfs/mixtures/test_gmm_diag_cov.py index 19063fef..4fdd2385 100644 --- a/tests/hyperion/pdfs/mixtures/test_gmm_diag_cov.py +++ b/tests/hyperion/pdfs/mixtures/test_gmm_diag_cov.py @@ -7,113 +7,107 @@ import os import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.pdfs import GMMDiagCov from numpy.testing import assert_allclose -output_dir = './tests/data_out/pdfs/core/mixtures/gmm_diag_cov' +output_dir = "./tests/data_out/pdfs/core/mixtures/gmm_diag_cov" if not os.path.exists(output_dir): os.makedirs(output_dir) x_dim = 3 pi1 = np.array([0.5, 0.25, 0.125, 0.125]) -mu1 = np.array([[-2, 1.5, -1], - [1, 1, 0], - [0, -1, 1], - [1.5, -1.5, 0.5]]) -S1 = np.square(np.array([[1, 0.75, 0.5], - [0.5, 0.3, 0.1], - [0.5, 0.6, 0.7], - [0.5, 0.4, 0.3]])) +mu1 = np.array([[-2, 1.5, -1], [1, 1, 0], [0, -1, 1], [1.5, -1.5, 0.5]]) +S1 = np.square( + np.array([[1, 0.75, 0.5], [0.5, 0.3, 0.1], [0.5, 0.6, 0.7], [0.5, 0.4, 0.3]]) +) num_samples = 1000 batch_size = 250 num_samples_init = 100 num_samples_train = 10000 -model_file = output_dir + '/model.h5' +model_file = output_dir + "/model.h5" def create_pdf(): - model = GMMDiagCov(num_comp=len(pi1), pi=pi1, mu=mu1, Lambda=1/S1, x_dim=x_dim) + model = GMMDiagCov(num_comp=len(pi1), pi=pi1, mu=mu1, Lambda=1 / S1, x_dim=x_dim) return model - + def test_properties(): model = create_pdf() assert_allclose(model.log_pi, np.log(model.pi)) - assert_allclose(model.Sigma, 1/model.Lambda) + assert_allclose(model.Sigma, 1 / model.Lambda) assert_allclose(model.cholLambda, np.sqrt(model.Lambda)) assert_allclose(model.logLambda, np.sum(np.log(model.Lambda), axis=-1)) - def test_initialize(): - model1 = create_pdf() - model1.initialize() + model1 = create_pdf() + model1.initialize() - model2 = GMMDiagCov(num_comp=model1.num_comp, - pi=model1.pi, - eta=model1.eta, x_dim=model1.x_dim) - model2.initialize() + model2 = GMMDiagCov( + num_comp=model1.num_comp, pi=model1.pi, eta=model1.eta, x_dim=model1.x_dim + ) + model2.initialize() - model3 = GMMDiagCov(num_comp=model2.num_comp, - pi=model2.pi, - mu=model2.mu, - Lambda=model2.Lambda, - x_dim=model1.x_dim) - model3.initialize() + model3 = GMMDiagCov( + num_comp=model2.num_comp, + pi=model2.pi, + mu=model2.mu, + Lambda=model2.Lambda, + x_dim=model1.x_dim, + ) + model3.initialize() - - assert_allclose(model1.eta, model2.eta) - 
assert_allclose(model1.eta, model3.eta) + assert_allclose(model1.eta, model2.eta) + assert_allclose(model1.eta, model3.eta) - assert_allclose(model1.A, model2.A) - assert_allclose(model1.A, model3.A) + assert_allclose(model1.A, model2.A) + assert_allclose(model1.A, model3.A) - assert_allclose(model1.mu, model2.mu) - assert_allclose(model1.mu, model3.mu) + assert_allclose(model1.mu, model2.mu) + assert_allclose(model1.mu, model3.mu) - assert_allclose(model1.Lambda, model2.Lambda) - assert_allclose(model1.Lambda, model3.Lambda) + assert_allclose(model1.Lambda, model2.Lambda) + assert_allclose(model1.Lambda, model3.Lambda) - def test_initialize_stdnormal(): model = GMMDiagCov(num_comp=1, x_dim=x_dim) model.initialize() - assert(model.pi==1) + assert model.pi == 1 assert_allclose(model.mu, np.zeros((1, x_dim))) assert_allclose(model.Lambda, np.ones((1, x_dim))) - def test_initialize_kmeans(): model1 = create_pdf() x = model1.sample(num_samples=num_samples_init) - + model2 = GMMDiagCov(num_comp=4, x_dim=x_dim) model2.initialize(x) - #print(model1.mu) - #print(model2.mu[[2,3,1,0]]) + # print(model1.mu) + # print(model2.mu[[2,3,1,0]]) - def test_log_h(): - model1 = create_pdf() + model1 = create_pdf() - sample_weight = np.arange(1,num_samples+1, dtype=float)/num_samples - - assert(model1.log_h(None) == 0) - assert(model1.accum_log_h(None, sample_weight=sample_weight) == 0) + sample_weight = np.arange(1, num_samples + 1, dtype=float) / num_samples + + assert model1.log_h(None) == 0 + assert model1.accum_log_h(None, sample_weight=sample_weight) == 0 def test_suff_stats(): @@ -121,63 +115,66 @@ def test_suff_stats(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - - u_x = np.hstack((x, x*x)) + sample_weight = 0.5 * np.ones((num_samples,)) + + u_x = np.hstack((x, x * x)) assert_allclose(model1.compute_suff_stats(x), u_x) N, u_x = model1.accum_suff_stats(x) - N1, u_x1 = model1.accum_suff_stats( - x, batch_size=batch_size) + N1, u_x1 = model1.accum_suff_stats(x, batch_size=batch_size) assert_allclose(N1, N) assert_allclose(u_x1, u_x) - N1, u_x1 = model1.accum_suff_stats( - x, sample_weight=sample_weight) - assert_allclose(N1, 0.5*N) - assert_allclose(u_x1, 0.5*u_x) + N1, u_x1 = model1.accum_suff_stats(x, sample_weight=sample_weight) + assert_allclose(N1, 0.5 * N) + assert_allclose(u_x1, 0.5 * u_x) N1, u_x1 = model1.accum_suff_stats( - x, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N1, 0.5*N) - assert_allclose(u_x1, 0.5*u_x) - + x, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N1, 0.5 * N) + assert_allclose(u_x1, 0.5 * u_x) + - def test_suff_stats_segments(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + N, u_x = model1.accum_suff_stats(x) - segments = np.array([[0, num_samples/2-1], - [num_samples/2, num_samples-1], - [0, num_samples/4-1], - [num_samples/4, num_samples/2-1], - [num_samples/2, 3*num_samples/4-1], - [3*num_samples/4, num_samples-1]], dtype=int) + segments = np.array( + [ + [0, num_samples / 2 - 1], + [num_samples / 2, num_samples - 1], + [0, num_samples / 4 - 1], + [num_samples / 4, num_samples / 2 - 1], + [num_samples / 2, 3 * num_samples / 4 - 1], + [3 * num_samples / 4, num_samples - 1], + ], + dtype=int, + ) print(N.shape) print(u_x.shape) - N1, u_x1 = model1.accum_suff_stats_segments( - x, segments, batch_size=batch_size) - assert_allclose(np.sum(N1, axis=0), 2*N) - 
assert_allclose(np.sum(u_x1, axis=0), 2*u_x) + N1, u_x1 = model1.accum_suff_stats_segments(x, segments, batch_size=batch_size) + assert_allclose(np.sum(N1, axis=0), 2 * N) + assert_allclose(np.sum(u_x1, axis=0), 2 * u_x) N2, u_x2 = model1.accum_suff_stats_segments( - x, segments, sample_weight=sample_weight) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) + x, segments, sample_weight=sample_weight + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) N2, u_x2 = model1.accum_suff_stats_segments( - x, segments, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) - + x, segments, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) def test_suff_stats_segments_prob(): @@ -185,32 +182,31 @@ def test_suff_stats_segments_prob(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + N, u_x = model1.accum_suff_stats(x) prob = np.zeros((num_samples, 4)) - prob[:int(num_samples/2), 0]=1 - prob[int(num_samples/2):int(3*num_samples/4), 1]=1 - prob[int(3*num_samples/4):int(4*num_samples/5), 2]=1 - prob[int(4*num_samples/5):, 3]=1 - - N1, u_x1 = model1.accum_suff_stats_segments_prob( - x, prob, batch_size=batch_size) + prob[: int(num_samples / 2), 0] = 1 + prob[int(num_samples / 2) : int(3 * num_samples / 4), 1] = 1 + prob[int(3 * num_samples / 4) : int(4 * num_samples / 5), 2] = 1 + prob[int(4 * num_samples / 5) :, 3] = 1 + + N1, u_x1 = model1.accum_suff_stats_segments_prob(x, prob, batch_size=batch_size) assert_allclose(np.sum(N1, axis=0), N) assert_allclose(np.sum(u_x1, axis=0), u_x) N2, u_x2 = model1.accum_suff_stats_segments_prob( - x, prob, sample_weight=sample_weight) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) + x, prob, sample_weight=sample_weight + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) N2, u_x2 = model1.accum_suff_stats_segments_prob( - x, prob, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) - - + x, prob, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) def test_suff_stats_sorttime(): @@ -218,29 +214,30 @@ def test_suff_stats_sorttime(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + N, u_x = model1.accum_suff_stats(x) - frame_length=int(num_samples/100) - frame_shift=frame_length - + frame_length = int(num_samples / 100) + frame_shift = frame_length + N1, u_x1 = model1.accum_suff_stats_sorttime( - x, frame_length, frame_shift, batch_size=batch_size) + x, frame_length, frame_shift, batch_size=batch_size + ) assert_allclose(np.sum(N1, axis=0), N) assert_allclose(np.sum(u_x1, axis=0), u_x) N2, u_x2 = model1.accum_suff_stats_sorttime( - x, frame_length, frame_shift, sample_weight=sample_weight) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) + x, frame_length, frame_shift, sample_weight=sample_weight + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) N2, u_x2 = model1.accum_suff_stats_sorttime( - x, frame_length, frame_shift, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) - - + x, frame_length, frame_shift, sample_weight=sample_weight, 
batch_size=batch_size + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) def test_log_prob(): @@ -248,36 +245,55 @@ def test_log_prob(): model1 = create_pdf() x = model1.sample(num_samples) - - assert_allclose(model1.log_prob(x, mode='nat'), - model1.log_prob(x, mode='std')) + + assert_allclose(model1.log_prob(x, mode="nat"), model1.log_prob(x, mode="std")) u_x = model1.compute_suff_stats(x) - assert_allclose(model1.log_prob(x, u_x, mode='nat'), - model1.log_prob(x, mode='std')) - + assert_allclose(model1.log_prob(x, u_x, mode="nat"), model1.log_prob(x, mode="std")) + def test_elbo(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - - assert(model1.elbo(x)/num_samples + 0.4 > np.mean(model1.log_prob(x, mode='std'))) - assert(model1.elbo(x, sample_weight=sample_weight)/num_samples + 0.2> - 0.5*np.sum(model1.log_prob(x, mode='std'))) - - - + sample_weight = 0.5 * np.ones((num_samples,)) + + assert model1.elbo(x) / num_samples + 0.4 > np.mean(model1.log_prob(x, mode="std")) + assert model1.elbo( + x, sample_weight=sample_weight + ) / num_samples + 0.2 > 0.5 * np.sum(model1.log_prob(x, mode="std")) + + def test_log_cdf(): - model1 = create_pdf() + model1 = create_pdf() - assert(model1.log_cdf(1e20*np.ones((1,x_dim,))) > np.log(0.99)) - assert(model1.log_cdf(-1e20*np.ones((1,x_dim,))) < np.log(0.01)) + assert ( + model1.log_cdf( + 1e20 + * np.ones( + ( + 1, + x_dim, + ) + ) + ) + > np.log(0.99) + ) + assert ( + model1.log_cdf( + -1e20 + * np.ones( + ( + 1, + x_dim, + ) + ) + ) + < np.log(0.01) + ) - def test_fit_kmeans(): @@ -290,24 +306,22 @@ def test_fit_kmeans(): model2.initialize(x) elbo = model2.fit(x, x_val=x_val) - - model2.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_kmeans_init_D01.pdf') + model2.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_kmeans_init_D01.pdf") plt.close() - model2.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_kmeans_init_D02.pdf') + model2.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_kmeans_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - plt.plot(elbo[3], 'r--') - plt.savefig(output_dir + '/fit_kmeans_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], "r--") + plt.savefig(output_dir + "/fit_kmeans_init_elbo.pdf") plt.close() - - + def test_fit_kmeans_split2(): model1 = create_pdf() @@ -323,23 +337,21 @@ def test_fit_kmeans_split2(): model2 = model2.split_comp(2) elbo = model2.fit(x, x_val=x_val) - - model2.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split2_init_D01.pdf') + model2.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split2_init_D01.pdf") plt.close() - model2.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split2_init_D02.pdf') + model2.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split2_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - plt.plot(elbo[3], 'r--') - plt.savefig(output_dir 
+ '/fit_split2_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], "r--") + plt.savefig(output_dir + "/fit_split2_init_elbo.pdf") plt.close() - def test_fit_kmeans_split4(): @@ -355,45 +367,42 @@ def test_fit_kmeans_split4(): model2 = model2.split_comp(4) elbo = model2.fit(x, x_val=x_val) - model2.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split4_init_D01.pdf') + model2.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split4_init_D01.pdf") plt.close() - model2.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split4_init_D02.pdf') + model2.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split4_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - plt.plot(elbo[3], 'r--') - plt.savefig(output_dir + '/fit_split4_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], "r--") + plt.savefig(output_dir + "/fit_split4_init_elbo.pdf") plt.close() - def test_plot(): - - model1 = create_pdf() - - model1.plot1D() - plt.savefig(output_dir + '/plot_1D.pdf') - plt.close() - model1.plot2D() - plt.savefig(output_dir + '/plot_2D.pdf') - plt.close() + model1 = create_pdf() - model1.plot3D() - plt.savefig(output_dir + '/plot_3D.pdf') - plt.close() + model1.plot1D() + plt.savefig(output_dir + "/plot_1D.pdf") + plt.close() - model1.plot3D_ellipsoid() - plt.savefig(output_dir + '/plot_3De.pdf') - plt.close() + model1.plot2D() + plt.savefig(output_dir + "/plot_2D.pdf") + plt.close() + model1.plot3D() + plt.savefig(output_dir + "/plot_3D.pdf") + plt.close() -if __name__ == '__main__': - pytest.main([__file__]) + model1.plot3D_ellipsoid() + plt.savefig(output_dir + "/plot_3De.pdf") + plt.close() +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/hyperion/pdfs/mixtures/test_gmm_tied_diag_cov.py b/tests/hyperion/pdfs/mixtures/test_gmm_tied_diag_cov.py index 39feb75d..b759aa77 100644 --- a/tests/hyperion/pdfs/mixtures/test_gmm_tied_diag_cov.py +++ b/tests/hyperion/pdfs/mixtures/test_gmm_tied_diag_cov.py @@ -7,171 +7,171 @@ import os import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from hyperion.pdfs import GMMTiedDiagCov from numpy.testing import assert_allclose -output_dir = './tests/data_out/pdfs/core/mixtures/gmm_tied_diag_cov' +output_dir = "./tests/data_out/pdfs/core/mixtures/gmm_tied_diag_cov" if not os.path.exists(output_dir): os.makedirs(output_dir) x_dim = 3 pi1 = np.array([0.5, 0.25, 0.125, 0.125]) -mu1 = np.array([[-2, 1.5, -1], - [1, 1, 0], - [0, -1, 1], - [1.5, -1.5, 0.5]]) +mu1 = np.array([[-2, 1.5, -1], [1, 1, 0], [0, -1, 1], [1.5, -1.5, 0.5]]) S1 = np.square(np.array([1, 0.75, 0.5])) num_samples = 1000 batch_size = 250 num_samples_init = 100 num_samples_train = 10000 -model_file = output_dir + '/model.h5' +model_file = output_dir + "/model.h5" def create_pdf(): - model = GMMTiedDiagCov(num_comp=len(pi1), pi=pi1, mu=mu1, Lambda=1/S1, x_dim=x_dim) + model = GMMTiedDiagCov( + num_comp=len(pi1), pi=pi1, mu=mu1, Lambda=1 / S1, x_dim=x_dim + ) return model 
- + def test_properties(): model = create_pdf() assert_allclose(model.log_pi, np.log(model.pi)) - assert_allclose(model.Sigma, 1/model.Lambda) + assert_allclose(model.Sigma, 1 / model.Lambda) assert_allclose(model.cholLambda, np.sqrt(model.Lambda)) assert_allclose(model.logLambda, np.sum(np.log(model.Lambda), axis=-1)) - def test_initialize(): - model1 = create_pdf() - model1.initialize() + model1 = create_pdf() + model1.initialize() - model2 = GMMTiedDiagCov(num_comp=model1.num_comp, - pi=model1.pi, - eta=model1.eta, x_dim=model1.x_dim) - model2.initialize() + model2 = GMMTiedDiagCov( + num_comp=model1.num_comp, pi=model1.pi, eta=model1.eta, x_dim=model1.x_dim + ) + model2.initialize() - model3 = GMMTiedDiagCov(num_comp=model2.num_comp, - pi=model2.pi, - mu=model2.mu, - Lambda=model2.Lambda, - x_dim=model1.x_dim) - model3.initialize() + model3 = GMMTiedDiagCov( + num_comp=model2.num_comp, + pi=model2.pi, + mu=model2.mu, + Lambda=model2.Lambda, + x_dim=model1.x_dim, + ) + model3.initialize() - assert_allclose(model1.eta, model2.eta) - assert_allclose(model1.eta, model3.eta) + assert_allclose(model1.eta, model2.eta) + assert_allclose(model1.eta, model3.eta) - assert_allclose(model1.A, model2.A) - assert_allclose(model1.A, model3.A) + assert_allclose(model1.A, model2.A) + assert_allclose(model1.A, model3.A) - assert_allclose(model1.mu, model2.mu) - assert_allclose(model1.mu, model3.mu) + assert_allclose(model1.mu, model2.mu) + assert_allclose(model1.mu, model3.mu) - assert_allclose(model1.Lambda, model2.Lambda) - assert_allclose(model1.Lambda, model3.Lambda) + assert_allclose(model1.Lambda, model2.Lambda) + assert_allclose(model1.Lambda, model3.Lambda) - def test_initialize_stdnormal(): model = GMMTiedDiagCov(num_comp=1, x_dim=x_dim) model.initialize() - assert(model.pi==1) + assert model.pi == 1 assert_allclose(model.mu, np.zeros((1, x_dim))) assert_allclose(model.Lambda, np.ones((x_dim,))) - def test_initialize_kmeans(): model1 = create_pdf() x = model1.sample(num_samples=num_samples_init) - + model2 = GMMTiedDiagCov(num_comp=4, x_dim=x_dim) model2.initialize(x) - def test_log_h(): - model1 = create_pdf() + model1 = create_pdf() - sample_weight = np.arange(1,num_samples+1, dtype=float)/num_samples - - assert(model1.log_h(None) == 0) - assert(model1.accum_log_h(None, sample_weight=sample_weight) == 0) + sample_weight = np.arange(1, num_samples + 1, dtype=float) / num_samples + + assert model1.log_h(None) == 0 + assert model1.accum_log_h(None, sample_weight=sample_weight) == 0 - def test_suff_stats(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - - u_x = np.hstack((x, x*x)) + sample_weight = 0.5 * np.ones((num_samples,)) + + u_x = np.hstack((x, x * x)) assert_allclose(model1.compute_suff_stats(x), u_x) N, u_x = model1.accum_suff_stats(x) - N1, u_x1 = model1.accum_suff_stats( - x, batch_size=batch_size) + N1, u_x1 = model1.accum_suff_stats(x, batch_size=batch_size) assert_allclose(N1, N) assert_allclose(u_x1, u_x) - N1, u_x1 = model1.accum_suff_stats( - x, sample_weight=sample_weight) - assert_allclose(N1, 0.5*N) - assert_allclose(u_x1, 0.5*u_x) + N1, u_x1 = model1.accum_suff_stats(x, sample_weight=sample_weight) + assert_allclose(N1, 0.5 * N) + assert_allclose(u_x1, 0.5 * u_x) N1, u_x1 = model1.accum_suff_stats( - x, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N1, 0.5*N) - assert_allclose(u_x1, 0.5*u_x) - + x, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N1, 0.5 * N) + 
assert_allclose(u_x1, 0.5 * u_x) + - def test_suff_stats_segments(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + N, u_x = model1.accum_suff_stats(x) - segments = np.array([[0, num_samples/2-1], - [num_samples/2, num_samples-1], - [0, num_samples/4-1], - [num_samples/4, num_samples/2-1], - [num_samples/2, 3*num_samples/4-1], - [3*num_samples/4, num_samples-1]], dtype=int) + segments = np.array( + [ + [0, num_samples / 2 - 1], + [num_samples / 2, num_samples - 1], + [0, num_samples / 4 - 1], + [num_samples / 4, num_samples / 2 - 1], + [num_samples / 2, 3 * num_samples / 4 - 1], + [3 * num_samples / 4, num_samples - 1], + ], + dtype=int, + ) print(N.shape) print(u_x.shape) - N1, u_x1 = model1.accum_suff_stats_segments( - x, segments, batch_size=batch_size) - assert_allclose(np.sum(N1, axis=0), 2*N) - assert_allclose(np.sum(u_x1, axis=0), 2*u_x) + N1, u_x1 = model1.accum_suff_stats_segments(x, segments, batch_size=batch_size) + assert_allclose(np.sum(N1, axis=0), 2 * N) + assert_allclose(np.sum(u_x1, axis=0), 2 * u_x) N2, u_x2 = model1.accum_suff_stats_segments( - x, segments, sample_weight=sample_weight) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) + x, segments, sample_weight=sample_weight + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) N2, u_x2 = model1.accum_suff_stats_segments( - x, segments, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) - + x, segments, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) def test_suff_stats_segments_prob(): @@ -179,31 +179,31 @@ def test_suff_stats_segments_prob(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + N, u_x = model1.accum_suff_stats(x) prob = np.zeros((num_samples, 4)) - prob[:int(num_samples/2), 0]=1 - prob[int(num_samples/2):int(3*num_samples/4), 1]=1 - prob[int(3*num_samples/4):int(4*num_samples/5), 2]=1 - prob[int(4*num_samples/5):, 3]=1 - - N1, u_x1 = model1.accum_suff_stats_segments_prob( - x, prob, batch_size=batch_size) + prob[: int(num_samples / 2), 0] = 1 + prob[int(num_samples / 2) : int(3 * num_samples / 4), 1] = 1 + prob[int(3 * num_samples / 4) : int(4 * num_samples / 5), 2] = 1 + prob[int(4 * num_samples / 5) :, 3] = 1 + + N1, u_x1 = model1.accum_suff_stats_segments_prob(x, prob, batch_size=batch_size) assert_allclose(np.sum(N1, axis=0), N) assert_allclose(np.sum(u_x1, axis=0), u_x) N2, u_x2 = model1.accum_suff_stats_segments_prob( - x, prob, sample_weight=sample_weight) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) + x, prob, sample_weight=sample_weight + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) N2, u_x2 = model1.accum_suff_stats_segments_prob( - x, prob, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) - + x, prob, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) def test_suff_stats_sorttime(): @@ -211,29 +211,30 @@ def test_suff_stats_sorttime(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - + sample_weight = 0.5 * np.ones((num_samples,)) + N, u_x = model1.accum_suff_stats(x) - 
frame_length=int(num_samples/100) - frame_shift=frame_length - + frame_length = int(num_samples / 100) + frame_shift = frame_length + N1, u_x1 = model1.accum_suff_stats_sorttime( - x, frame_length, frame_shift, batch_size=batch_size) + x, frame_length, frame_shift, batch_size=batch_size + ) assert_allclose(np.sum(N1, axis=0), N) assert_allclose(np.sum(u_x1, axis=0), u_x) N2, u_x2 = model1.accum_suff_stats_sorttime( - x, frame_length, frame_shift, sample_weight=sample_weight) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) + x, frame_length, frame_shift, sample_weight=sample_weight + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) N2, u_x2 = model1.accum_suff_stats_sorttime( - x, frame_length, frame_shift, sample_weight=sample_weight, batch_size=batch_size) - assert_allclose(N2, 0.5*N1) - assert_allclose(u_x2, 0.5*u_x1) - - + x, frame_length, frame_shift, sample_weight=sample_weight, batch_size=batch_size + ) + assert_allclose(N2, 0.5 * N1) + assert_allclose(u_x2, 0.5 * u_x1) def test_log_prob(): @@ -241,36 +242,55 @@ def test_log_prob(): model1 = create_pdf() x = model1.sample(num_samples) - - assert_allclose(model1.log_prob(x, mode='nat'), - model1.log_prob(x, mode='std')) + + assert_allclose(model1.log_prob(x, mode="nat"), model1.log_prob(x, mode="std")) u_x = model1.compute_suff_stats(x) - assert_allclose(model1.log_prob(x, u_x, mode='nat'), - model1.log_prob(x, mode='std')) - + assert_allclose(model1.log_prob(x, u_x, mode="nat"), model1.log_prob(x, mode="std")) + def test_elbo(): model1 = create_pdf() x = model1.sample(num_samples) - sample_weight = 0.5*np.ones((num_samples,)) - - assert(model1.elbo(x)/num_samples + 0.4> np.mean(model1.log_prob(x, mode='std'))) - assert(model1.elbo(x, sample_weight=sample_weight)/num_samples + 0.2> - 0.5*np.sum(model1.log_prob(x, mode='std'))) - - - + sample_weight = 0.5 * np.ones((num_samples,)) + + assert model1.elbo(x) / num_samples + 0.4 > np.mean(model1.log_prob(x, mode="std")) + assert model1.elbo( + x, sample_weight=sample_weight + ) / num_samples + 0.2 > 0.5 * np.sum(model1.log_prob(x, mode="std")) + + def test_log_cdf(): - model1 = create_pdf() + model1 = create_pdf() - assert(model1.log_cdf(1e20*np.ones((1,x_dim,))) > np.log(0.99)) - assert(model1.log_cdf(-1e20*np.ones((1,x_dim,))) < np.log(0.01)) + assert ( + model1.log_cdf( + 1e20 + * np.ones( + ( + 1, + x_dim, + ) + ) + ) + > np.log(0.99) + ) + assert ( + model1.log_cdf( + -1e20 + * np.ones( + ( + 1, + x_dim, + ) + ) + ) + < np.log(0.01) + ) - def test_fit_kmeans(): @@ -283,24 +303,22 @@ def test_fit_kmeans(): model2.initialize(x) elbo = model2.fit(x, x_val=x_val) - - model2.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_kmeans_init_D01.pdf') + model2.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_kmeans_init_D01.pdf") plt.close() - model2.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_kmeans_init_D02.pdf') + model2.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_kmeans_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - plt.plot(elbo[3], 'r--') - plt.savefig(output_dir + '/fit_kmeans_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], 
"r--") + plt.savefig(output_dir + "/fit_kmeans_init_elbo.pdf") plt.close() - - + def test_fit_kmeans_split2(): model1 = create_pdf() @@ -316,23 +334,21 @@ def test_fit_kmeans_split2(): model2 = model2.split_comp(2) elbo = model2.fit(x, x_val=x_val) - - model2.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split2_init_D01.pdf') + model2.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split2_init_D01.pdf") plt.close() - model2.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split2_init_D02.pdf') + model2.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split2_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - plt.plot(elbo[3], 'r--') - plt.savefig(output_dir + '/fit_split2_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], "r--") + plt.savefig(output_dir + "/fit_split2_init_elbo.pdf") plt.close() - def test_fit_kmeans_split4(): @@ -348,45 +364,42 @@ def test_fit_kmeans_split4(): model2 = model2.split_comp(4) elbo = model2.fit(x, x_val=x_val) - model2.plot2D(feat_idx=[0,1], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split4_init_D01.pdf') + model2.plot2D(feat_idx=[0, 1], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split4_init_D01.pdf") plt.close() - model2.plot2D(feat_idx=[0,2], num_sigmas=1) - plt.savefig(output_dir + '/plot_fit_split4_init_D02.pdf') + model2.plot2D(feat_idx=[0, 2], num_sigmas=1) + plt.savefig(output_dir + "/plot_fit_split4_init_D02.pdf") plt.close() plt.figure() - plt.plot(np.repeat(model1.elbo(x)/x.shape[0], len(elbo[1])), 'b') - plt.plot(np.repeat(model1.elbo(x_val)/x.shape[0], len(elbo[1])), 'b--') - plt.plot(elbo[1], 'r') - plt.plot(elbo[3], 'r--') - plt.savefig(output_dir + '/fit_split4_init_elbo.pdf') + plt.plot(np.repeat(model1.elbo(x) / x.shape[0], len(elbo[1])), "b") + plt.plot(np.repeat(model1.elbo(x_val) / x.shape[0], len(elbo[1])), "b--") + plt.plot(elbo[1], "r") + plt.plot(elbo[3], "r--") + plt.savefig(output_dir + "/fit_split4_init_elbo.pdf") plt.close() - def test_plot(): - - model1 = create_pdf() - - model1.plot1D() - plt.savefig(output_dir + '/plot_1D.pdf') - plt.close() - model1.plot2D() - plt.savefig(output_dir + '/plot_2D.pdf') - plt.close() + model1 = create_pdf() - model1.plot3D() - plt.savefig(output_dir + '/plot_3D.pdf') - plt.close() + model1.plot1D() + plt.savefig(output_dir + "/plot_1D.pdf") + plt.close() - model1.plot3D_ellipsoid() - plt.savefig(output_dir + '/plot_3De.pdf') - plt.close() + model1.plot2D() + plt.savefig(output_dir + "/plot_2D.pdf") + plt.close() + model1.plot3D() + plt.savefig(output_dir + "/plot_3D.pdf") + plt.close() -if __name__ == '__main__': - pytest.main([__file__]) + model1.plot3D_ellipsoid() + plt.savefig(output_dir + "/plot_3De.pdf") + plt.close() +if __name__ == "__main__": + pytest.main([__file__]) diff --git a/tests/hyperion/pdfs/plda/test_frplda.py b/tests/hyperion/pdfs/plda/test_frplda.py index 6efd8d85..95ce3dfa 100644 --- a/tests/hyperion/pdfs/plda/test_frplda.py +++ b/tests/hyperion/pdfs/plda/test_frplda.py @@ -8,7 +8,8 @@ import pytest import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") import matplotlib.pyplot as plt from numpy.testing import 
assert_allclose @@ -23,12 +24,10 @@ num_spc = 10 mu = np.array([0.5, 0]) -Sb = np.array([[1, 0.05], - [0.05, 0.1]]) -Sw = np.array([[0.1, 0.05], - [0.05, 1]]) +Sb = np.array([[1, 0.05], [0.05, 0.1]]) +Sw = np.array([[0.1, 0.05], [0.05, 1]]) -output_dir = './tests/data_out/pdfs/plda/frplda' +output_dir = "./tests/data_out/pdfs/plda/frplda" if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -40,23 +39,30 @@ def create_plda(): return plda -def plot_plda(plda, colors=['b','g','r'], linestyle='--', label=None): +def plot_plda(plda, colors=["b", "g", "r"], linestyle="--", label=None): hw = 0.05 hl = 0.1 - fc=colors[0] - ec=colors[0] + fc = colors[0] + ec = colors[0] ax = plt.gca() - ax.arrow(0,0,plda.mu[0],plda.mu[1], - head_width=hw, head_length=hl, fc=fc, ec=ec, - linestyle=linestyle, label=label) + ax.arrow( + 0, + 0, + plda.mu[0], + plda.mu[1], + head_width=hw, + head_length=hl, + fc=fc, + ec=ec, + linestyle=linestyle, + label=label, + ) Sb = la.inv(plda.B) pge2d(plda.mu, Sb, color=colors[1], linestyle=linestyle) Sw = la.inv(plda.W) pge2d(plda.mu, Sw, color=colors[2], linestyle=linestyle) - - def test_sample(): @@ -64,19 +70,18 @@ def test_sample(): x = plda.sample(num_classes, num_spc) spk_idx = [0, 1, 2, 3] - colors = ['b','r','g','m','c'] - markers = ['o','v','*','p','s'] + colors = ["b", "r", "g", "m", "c"] + markers = ["o", "v", "*", "p", "s"] plt.figure() for i in range(len(spk_idx)): idx = spk_idx[i] - x_i = x[idx*num_spc:(idx+1)*num_spc] - plt.scatter(x_i[:,0], x_i[:,1], 2, colors[i], markers[i]) + x_i = x[idx * num_spc : (idx + 1) * num_spc] + plt.scatter(x_i[:, 0], x_i[:, 1], 2, colors[i], markers[i]) - plt.savefig(output_dir + '/sample.pdf') + plt.savefig(output_dir + "/sample.pdf") plt.close() - def test_initialize(): plda1 = create_plda() @@ -88,38 +93,36 @@ def test_initialize(): plda2.initialize(D) plt.figure() - plot_plda(plda1, label='ground truth') - plot_plda(plda2, linestyle='--', label='trained') + plot_plda(plda1, label="ground truth") + plot_plda(plda2, linestyle="--", label="trained") plt.legend() plt.grid() - plt.savefig(output_dir + '/init.pdf') + plt.savefig(output_dir + "/init.pdf") plt.close() - def test_py(): - + plda = create_plda() - x,y = plda.sample(3, 3, seed=1024, return_y=True) + x, y = plda.sample(3, 3, seed=1024, return_y=True) class_ids = np.repeat(np.arange(3), 3) y_t = y[::3] D = plda.compute_stats_hard(x, class_ids) y, Cy = plda.compute_py_g_x(D, return_cov=True) - colors = ['b','r','g'] + colors = ["b", "r", "g"] plt.figure() for i in range(len(y_t)): - plt.plot(y_t[i,0], y_t[i,1], color=colors[i], marker='*') - plt.plot(y[i,0], y[i,1], color=colors[i], marker='s') + plt.plot(y_t[i, 0], y_t[i, 1], color=colors[i], marker="*") + plt.plot(y[i, 0], y[i, 1], color=colors[i], marker="s") pge2d(y[i], Cy[i], color=colors[i]) plt.grid() - plt.savefig(output_dir+'/py.pdf') + plt.savefig(output_dir + "/py.pdf") plt.close() - def test_fit(): plda1 = create_plda() @@ -132,17 +135,16 @@ def test_fit(): plt.figure() plt.plot(elbo) plt.grid() - plt.savefig(output_dir + '/fit_elbo.pdf') + plt.savefig(output_dir + "/fit_elbo.pdf") plt.close() plt.figure() - plot_plda(plda1, label='ground truth') - plot_plda(plda2, linestyle='--', label='trained') + plot_plda(plda1, label="ground truth") + plot_plda(plda2, linestyle="--", label="trained") plt.legend() plt.grid() - plt.savefig(output_dir + '/fit.pdf') + plt.savefig(output_dir + "/fit.pdf") plt.close() - def test_llr_1vs1(): @@ -153,7 +155,7 @@ def test_llr_1vs1(): x_t = x[1::2] tar = np.eye(num_classes, 
dtype=bool) non = np.logical_not(tar) - + scores = plda.llr_1vs1(x_e, x_t) scores_tar = scores[tar] scores_non = scores[non] @@ -161,17 +163,19 @@ def test_llr_1vs1(): assert np.mean(scores_tar) > np.mean(scores_non) plt.figure() - plt.hist(scores_tar, int(num_classes/10), density=True, label='tar', color='b') - plt.hist(scores_non, int(num_classes**2/20), density=True, label='non', color='r') + plt.hist(scores_tar, int(num_classes / 10), density=True, label="tar", color="b") + plt.hist( + scores_non, int(num_classes ** 2 / 20), density=True, label="non", color="r" + ) plt.grid() - plt.savefig(output_dir + '/llr_1vs1.pdf') + plt.savefig(output_dir + "/llr_1vs1.pdf") plt.close() - + def test_llrNvsM(): plt.figure() - + plda = create_plda() x = plda.sample(num_classes, 6, seed=1024) x_e = x[::2] @@ -181,117 +185,110 @@ def test_llrNvsM(): non = np.logical_not(tar) ## by the book - scores = plda.llr_NvsM(x_e, x_t, ids1=class_ids, ids2=class_ids, method='book') + scores = plda.llr_NvsM(x_e, x_t, ids1=class_ids, ids2=class_ids, method="book") scores_tar = scores[tar] scores_non = scores[non] assert np.mean(scores_tar) > np.mean(scores_non) - plt.hist(scores_tar, int(num_classes/10), density=True, label='book', color='b') - plt.hist(scores_non, int(num_classes**2/20), density=True, color='b') + plt.hist(scores_tar, int(num_classes / 10), density=True, label="book", color="b") + plt.hist(scores_non, int(num_classes ** 2 / 20), density=True, color="b") ## score averaging - scores = plda.llr_NvsM(x_e, x_t, ids1=class_ids, ids2=class_ids, method='savg') + scores = plda.llr_NvsM(x_e, x_t, ids1=class_ids, ids2=class_ids, method="savg") scores_tar = scores[tar] scores_non = scores[non] assert np.mean(scores_tar) > np.mean(scores_non) - plt.hist(scores_tar, int(num_classes/10), density=True, label='s-avg', color='r') - plt.hist(scores_non, int(num_classes**2/20), density=True, color='r') - + plt.hist(scores_tar, int(num_classes / 10), density=True, label="s-avg", color="r") + plt.hist(scores_non, int(num_classes ** 2 / 20), density=True, color="r") ## i-vector averaging - scores = plda.llr_NvsM(x_e, x_t, ids1=class_ids, ids2=class_ids, method='vavg') + scores = plda.llr_NvsM(x_e, x_t, ids1=class_ids, ids2=class_ids, method="vavg") scores_tar = scores[tar] scores_non = scores[non] assert np.mean(scores_tar) > np.mean(scores_non) - plt.hist(scores_tar, int(num_classes/10), density=True, label='iv-avg', color='g') - plt.hist(scores_non, int(num_classes**2/20), density=True, color='g') - + plt.hist(scores_tar, int(num_classes / 10), density=True, label="iv-avg", color="g") + plt.hist(scores_non, int(num_classes ** 2 / 20), density=True, color="g") ## i-vector averaging - scores = plda.llr_NvsM(x_e, x_t, ids1=class_ids, ids2=class_ids, method='vavg-lnorm') + scores = plda.llr_NvsM( + x_e, x_t, ids1=class_ids, ids2=class_ids, method="vavg-lnorm" + ) scores_tar = scores[tar] scores_non = scores[non] assert np.mean(scores_tar) > np.mean(scores_non) - plt.hist(scores_tar, int(num_classes/10), density=True, label='iv-avg+lnorm', color='m') - plt.hist(scores_non, int(num_classes**2/20), density=True, color='m') + plt.hist( + scores_tar, int(num_classes / 10), density=True, label="iv-avg+lnorm", color="m" + ) + plt.hist(scores_non, int(num_classes ** 2 / 20), density=True, color="m") - plt.grid() - plt.savefig(output_dir + '/llr_NvsM.pdf') + plt.savefig(output_dir + "/llr_NvsM.pdf") plt.close() - def test_llrNvs1(): plt.figure() - + plda = create_plda() x = plda.sample(num_classes, 4, seed=1024) mask = 
np.zeros((len(x),), dtype=bool) mask[::4] = True - x_e = x[mask==False] + x_e = x[mask == False] x_t = x[mask] class_ids = np.repeat(np.arange(num_classes), 3) tar = np.eye(num_classes, dtype=bool) non = np.logical_not(tar) ## by the book - scores = plda.llr_Nvs1(x_e, x_t, ids1=class_ids, method='book') + scores = plda.llr_Nvs1(x_e, x_t, ids1=class_ids, method="book") scores_tar = scores[tar] scores_non = scores[non] assert np.mean(scores_tar) > np.mean(scores_non) - plt.hist(scores_tar, int(num_classes/10), density=True, label='book', color='b') - plt.hist(scores_non, int(num_classes**2/20), density=True, color='b') + plt.hist(scores_tar, int(num_classes / 10), density=True, label="book", color="b") + plt.hist(scores_non, int(num_classes ** 2 / 20), density=True, color="b") ## score averaging - scores = plda.llr_Nvs1(x_e, x_t, ids1=class_ids, method='savg') + scores = plda.llr_Nvs1(x_e, x_t, ids1=class_ids, method="savg") scores_tar = scores[tar] scores_non = scores[non] assert np.mean(scores_tar) > np.mean(scores_non) - plt.hist(scores_tar, int(num_classes/10), density=True, label='s-avg', color='r') - plt.hist(scores_non, int(num_classes**2/20), density=True, color='r') - + plt.hist(scores_tar, int(num_classes / 10), density=True, label="s-avg", color="r") + plt.hist(scores_non, int(num_classes ** 2 / 20), density=True, color="r") ## i-vector averaging - scores = plda.llr_Nvs1(x_e, x_t, ids1=class_ids, method='vavg') + scores = plda.llr_Nvs1(x_e, x_t, ids1=class_ids, method="vavg") scores_tar = scores[tar] scores_non = scores[non] assert np.mean(scores_tar) > np.mean(scores_non) - plt.hist(scores_tar, int(num_classes/10), density=True, label='iv-avg', color='g') - plt.hist(scores_non, int(num_classes**2/20), density=True, color='g') - + plt.hist(scores_tar, int(num_classes / 10), density=True, label="iv-avg", color="g") + plt.hist(scores_non, int(num_classes ** 2 / 20), density=True, color="g") ## i-vector averaging - scores = plda.llr_Nvs1(x_e, x_t, ids1=class_ids, method='vavg-lnorm') + scores = plda.llr_Nvs1(x_e, x_t, ids1=class_ids, method="vavg-lnorm") scores_tar = scores[tar] scores_non = scores[non] assert np.mean(scores_tar) > np.mean(scores_non) - plt.hist(scores_tar, int(num_classes/10), density=True, label='iv-avg+lnorm', color='m') - plt.hist(scores_non, int(num_classes**2/20), density=True, color='m') + plt.hist( + scores_tar, int(num_classes / 10), density=True, label="iv-avg+lnorm", color="m" + ) + plt.hist(scores_non, int(num_classes ** 2 / 20), density=True, color="m") - plt.grid() - plt.savefig(output_dir + '/llr_Nvs1.pdf') + plt.savefig(output_dir + "/llr_Nvs1.pdf") plt.close() - - - - - - diff --git a/tests/hyperion/utils/test_kaldi_matrix.py b/tests/hyperion/utils/test_kaldi_matrix.py index c1cd4222..b9408f62 100644 --- a/tests/hyperion/utils/test_kaldi_matrix.py +++ b/tests/hyperion/utils/test_kaldi_matrix.py @@ -11,7 +11,7 @@ from hyperion.utils.kaldi_matrix import KaldiMatrix as KM from hyperion.utils.kaldi_matrix import KaldiCompressedMatrix as KCM -output_dir = './tests/data_out/utils/kaldi_matrix' +output_dir = "./tests/data_out/utils/kaldi_matrix" if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -19,309 +19,307 @@ def create_matrix(r=10, c=4): return np.random.randn(r, c) + def create_matrix_int(r=10, c=4): - return 32000*(np.random.rand(r, c)-0.5) + return 32000 * (np.random.rand(r, c) - 0.5) + def create_matrix_uint(r=10, c=4): - return 250*np.random.rand(r, c) + return 250 * np.random.rand(r, c) + def create_matrix_01(r=10, c=4): return 
np.random.rand(r, c) + def create_vector(d=10): return np.random.randn(d) def test_km_read_write(): - file_path = output_dir + '/km.mat' + file_path = output_dir + "/km.mat" # Test Matrix - mat1 = KM(create_matrix().astype('float32')) - with open(file_path, 'w') as f: + mat1 = KM(create_matrix().astype("float32")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=False) assert_allclose(mat1.data, mat2.data, atol=1e-4) - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=True) assert_allclose(mat1.data, mat2.data) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=True, row_offset=5) assert_allclose(mat1.data[5:], mat2.data) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=True, row_offset=4, num_rows=4) assert_allclose(mat1.data[4:8], mat2.data) - - mat1 = KM(mat1.data.astype('float64')) - with open(file_path, 'w') as f: + mat1 = KM(mat1.data.astype("float64")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=False) assert_allclose(mat1.data, mat2.data, atol=1e-4) - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=True) assert_allclose(mat1.data, mat2.data) # Test Vector - mat1 = KM(create_vector().astype('float32')) - with open(file_path, 'w') as f: + mat1 = KM(create_vector().astype("float32")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=False) assert_allclose(mat1.data, mat2.data, atol=1e-4) - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=True) assert_allclose(mat1.data, mat2.data) - - mat1 = KM(mat1.data.astype('float64')) - with open(file_path, 'w') as f: + mat1 = KM(mat1.data.astype("float64")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=False) assert_allclose(mat1.data, mat2.data, atol=1e-4) - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KM.read(f, binary=True) assert_allclose(mat1.data, mat2.data) - def test_km_read_shape(): - file_path = output_dir + '/km.mat' + file_path = output_dir + "/km.mat" # Test Matrix - mat1 = KM(create_matrix(10,4).astype('float32')) - with open(file_path, 'w') as f: + mat1 = KM(create_matrix(10, 4).astype("float32")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KM.read_shape(f, binary=False) == (10, 4) - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KM.read_shape(f, binary=True) == (10, 4) - - mat1 = KM(mat1.data.astype('float64')) - with open(file_path, 'w') as f: + mat1 = 
KM(mat1.data.astype("float64")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KM.read_shape(f, binary=False) == (10, 4) - - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KM.read_shape(f, binary=True) == (10, 4) - # Test Vector - mat1 = KM(create_vector(10).astype('float32')) - with open(file_path, 'w') as f: + mat1 = KM(create_vector(10).astype("float32")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KM.read_shape(f, binary=False) == (10,) - - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KM.read_shape(f, binary=True) == (10,) - - mat1 = KM(mat1.data.astype('float64')) - with open(file_path, 'w') as f: + mat1 = KM(mat1.data.astype("float64")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KM.read_shape(f, binary=False) == (10,) - - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KM.read_shape(f, binary=True) == (10,) - def test_kcm_compress(): - mat1 = KM(create_matrix().astype('float32')) - cmat2 = KCM.compress(mat1, 'auto') + mat1 = KM(create_matrix().astype("float32")) + cmat2 = KCM.compress(mat1, "auto") mat2 = cmat2.to_matrix() - print(np.max(np.abs(mat1.data-mat2.data))) + print(np.max(np.abs(mat1.data - mat2.data))) assert_allclose(mat1.data, mat2.data, atol=0.025) - cmat2 = KCM.compress(mat1, 'speech-feat-t') + cmat2 = KCM.compress(mat1, "speech-feat-t") mat2 = cmat2.to_matrix() - print(np.max(np.abs(mat1.data-mat2.data))) + print(np.max(np.abs(mat1.data - mat2.data))) assert_allclose(mat1.data, mat2.data, atol=0.025) - cmat2 = KCM.compress(mat1, '2byte-auto') + cmat2 = KCM.compress(mat1, "2byte-auto") mat2 = cmat2.to_matrix() - print(np.max(np.abs(mat1.data-mat2.data))) + print(np.max(np.abs(mat1.data - mat2.data))) assert_allclose(mat1.data, mat2.data, atol=0.025) - cmat2 = KCM.compress(mat1, '1byte-auto') + cmat2 = KCM.compress(mat1, "1byte-auto") mat2 = cmat2.to_matrix() - print(np.max(np.abs(mat1.data-mat2.data))) + print(np.max(np.abs(mat1.data - mat2.data))) assert_allclose(mat1.data, mat2.data, atol=0.025) - - mat1 = KM(create_matrix(3,4).astype('float32')) - cmat2 = KCM.compress(mat1, 'auto') + mat1 = KM(create_matrix(3, 4).astype("float32")) + cmat2 = KCM.compress(mat1, "auto") mat2 = cmat2.to_matrix() - + assert_allclose(mat1.data, mat2.data, atol=0.025) - cmat2 = KCM.compress(mat1, 'speech-feat-t') + cmat2 = KCM.compress(mat1, "speech-feat-t") mat2 = cmat2.to_matrix() - + assert_allclose(mat1.data, mat2.data, atol=0.025) - mat1 = KM(create_matrix_int().astype('float32')) - cmat2 = KCM.compress(mat1, '2byte-signed-integer') + mat1 = KM(create_matrix_int().astype("float32")) + cmat2 = KCM.compress(mat1, "2byte-signed-integer") mat2 = cmat2.to_matrix() - print(np.max(np.abs(mat1.data-mat2.data))) + print(np.max(np.abs(mat1.data - mat2.data))) assert_allclose(mat1.data, mat2.data, atol=0.75) - mat1 = KM(create_matrix_uint().astype('float32')) - cmat2 = KCM.compress(mat1, '1byte-unsigned-integer') + mat1 = 
KM(create_matrix_uint().astype("float32")) + cmat2 = KCM.compress(mat1, "1byte-unsigned-integer") mat2 = cmat2.to_matrix() - print(np.max(np.abs(mat1.data-mat2.data))) + print(np.max(np.abs(mat1.data - mat2.data))) assert_allclose(mat1.data, mat2.data, atol=1.5) - - mat1 = KM(create_matrix_01().astype('float32')) - cmat2 = KCM.compress(mat1, '1byte-0-1') + mat1 = KM(create_matrix_01().astype("float32")) + cmat2 = KCM.compress(mat1, "1byte-0-1") mat2 = cmat2.to_matrix() - print(np.max(np.abs(mat1.data-mat2.data))) + print(np.max(np.abs(mat1.data - mat2.data))) assert_allclose(mat1.data, mat2.data, atol=0.025) - def test_kcm_read_write(): - - file_path = output_dir + '/kcm.mat' - mat1 = KCM.compress(create_matrix().astype('float32')) - with open(file_path, 'w') as f: + file_path = output_dir + "/kcm.mat" + + mat1 = KCM.compress(create_matrix().astype("float32")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KCM.read(f, binary=False) - assert_allclose(np.frombuffer(mat1.data, dtype=np.uint8), - np.frombuffer(mat2.data, dtype=np.uint8), atol=5) + assert_allclose( + np.frombuffer(mat1.data, dtype=np.uint8), + np.frombuffer(mat2.data, dtype=np.uint8), + atol=5, + ) - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KCM.read(f, binary=True) - assert_allclose(np.frombuffer(mat1.data, dtype=np.uint8), - np.frombuffer(mat2.data, dtype=np.uint8)) + assert_allclose( + np.frombuffer(mat1.data, dtype=np.uint8), + np.frombuffer(mat2.data, dtype=np.uint8), + ) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KCM.read(f, binary=True, row_offset=5) print(mat1.to_ndarray()) print(mat2.to_ndarray()) assert_allclose(mat1.to_ndarray()[5:], mat2.to_ndarray()) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KCM.read(f, binary=True, row_offset=4, num_rows=4) assert_allclose(mat1.to_ndarray()[4:8], mat2.to_ndarray()) def test_kcm_read_write_fomat4(): - - file_path = output_dir + '/kcm.mat' - mat1 = KCM.compress(create_matrix().astype('float32'), method='speech-feat-t') - with open(file_path, 'w') as f: + file_path = output_dir + "/kcm.mat" + + mat1 = KCM.compress(create_matrix().astype("float32"), method="speech-feat-t") + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KCM.read(f, binary=False) - assert_allclose(np.frombuffer(mat1.data, dtype=np.uint8), - np.frombuffer(mat2.data, dtype=np.uint8), atol=5) + assert_allclose( + np.frombuffer(mat1.data, dtype=np.uint8), + np.frombuffer(mat2.data, dtype=np.uint8), + atol=5, + ) - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KCM.read(f, binary=True) - assert_allclose(np.frombuffer(mat1.data, dtype=np.uint8), - np.frombuffer(mat2.data, dtype=np.uint8)) + assert_allclose( + np.frombuffer(mat1.data, dtype=np.uint8), + np.frombuffer(mat2.data, dtype=np.uint8), + ) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KCM.read(f, binary=True, row_offset=5) print(mat1.to_ndarray()) print(mat2.to_ndarray()) assert_allclose(mat1.to_ndarray()[5:], mat2.to_ndarray()) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: mat2 = KCM.read(f, binary=True, 
row_offset=4, num_rows=4) assert_allclose(mat1.to_ndarray()[4:8], mat2.to_ndarray()) - def test_kcm_read_shape(): - - file_path = output_dir + '/kcm.mat' - mat1 = KCM.compress(create_matrix(10, 4).astype('float32')) - with open(file_path, 'w') as f: + file_path = output_dir + "/kcm.mat" + + mat1 = KCM.compress(create_matrix(10, 4).astype("float32")) + with open(file_path, "w") as f: mat1.write(f, binary=False) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KCM.read_shape(f, binary=False) == (10, 4) - - with open(file_path, 'wb') as f: + with open(file_path, "wb") as f: mat1.write(f, binary=True) - with open(file_path, 'rb') as f: + with open(file_path, "rb") as f: assert KCM.read_shape(f, binary=True) == (10, 4) - def test_kcm_getbuild_data_attrs(): - mat1 = KM(create_matrix().astype('float32')) - cmat2 = KCM.compress(mat1, 'auto') + mat1 = KM(create_matrix().astype("float32")) + cmat2 = KCM.compress(mat1, "auto") mat2 = cmat2.to_matrix() data, attrs = cmat2.get_data_attrs() cmat3 = KCM.build_from_data_attrs(data, attrs) @@ -329,8 +327,7 @@ def test_kcm_getbuild_data_attrs(): assert_allclose(mat2.data, mat3.data) - - cmat2 = KCM.compress(mat1, '2byte-auto') + cmat2 = KCM.compress(mat1, "2byte-auto") mat2 = cmat2.to_matrix() data, attrs = cmat2.get_data_attrs() cmat3 = KCM.build_from_data_attrs(data, attrs) @@ -338,8 +335,7 @@ def test_kcm_getbuild_data_attrs(): assert_allclose(mat2.data, mat3.data) - - cmat2 = KCM.compress(mat1, '1byte-auto') + cmat2 = KCM.compress(mat1, "1byte-auto") mat2 = cmat2.to_matrix() data, attrs = cmat2.get_data_attrs() cmat3 = KCM.build_from_data_attrs(data, attrs) @@ -350,8 +346,8 @@ def test_kcm_getbuild_data_attrs(): def test_kcm_getbuild_data_attrs_slice(): - mat1 = KM(create_matrix().astype('float32')) - cmat2 = KCM.compress(mat1, 'auto') + mat1 = KM(create_matrix().astype("float32")) + cmat2 = KCM.compress(mat1, "auto") mat2 = cmat2.to_matrix() data, attrs = cmat2.get_data_attrs() cmat3 = KCM.build_from_data_attrs(data[2:7], attrs) @@ -359,8 +355,7 @@ def test_kcm_getbuild_data_attrs_slice(): assert_allclose(mat2.data[2:7], mat3.data) - - cmat2 = KCM.compress(mat1, '2byte-auto') + cmat2 = KCM.compress(mat1, "2byte-auto") mat2 = cmat2.to_matrix() data, attrs = cmat2.get_data_attrs() cmat3 = KCM.build_from_data_attrs(data[2:7], attrs) @@ -368,8 +363,7 @@ def test_kcm_getbuild_data_attrs_slice(): assert_allclose(mat2.data[2:7], mat3.data) - - cmat2 = KCM.compress(mat1, '1byte-auto') + cmat2 = KCM.compress(mat1, "1byte-auto") mat2 = cmat2.to_matrix() data, attrs = cmat2.get_data_attrs() cmat3 = KCM.build_from_data_attrs(data[2:7], attrs) @@ -377,8 +371,6 @@ def test_kcm_getbuild_data_attrs_slice(): assert_allclose(mat2.data[2:7], mat3.data) - - -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_list_utils.py b/tests/hyperion/utils/test_list_utils.py index 492958dd..a7432125 100644 --- a/tests/hyperion/utils/test_list_utils.py +++ b/tests/hyperion/utils/test_list_utils.py @@ -8,10 +8,11 @@ from hyperion.utils.list_utils import * + def create_lists(): - list1 = ['2', '2', '1', '2', '3', '3', '4', '4', '4', '4'] - list2 = ['0', '1', '2', '20'] + list1 = ["2", "2", "1", "2", "3", "3", "4", "4", "4", "4"] + list2 = ["0", "1", "2", "20"] return list1, list2 @@ -19,44 +20,47 @@ def test_ismember(): list1, list2 = create_lists() f, loc = ismember(list2, list1) - assert(np.all(loc == [np.iinfo(np.int32).min, 2, 0, np.iinfo(np.int32).min])) - assert(np.all(f == [False, True, 
True, False])) + assert np.all(loc == [np.iinfo(np.int32).min, 2, 0, np.iinfo(np.int32).min]) + assert np.all(f == [False, True, True, False]) + - def test_sort(): list1, list2 = create_lists() list1_s, idx = sort(list1, return_index=True) - assert(np.all(list1_s == ['1', '2', '2', '2', '3', '3', '4', '4', '4', '4'])) + assert np.all(list1_s == ["1", "2", "2", "2", "3", "3", "4", "4", "4", "4"]) + - def test_unique(): list1, list2 = create_lists() - list1_u, i_a, i_b, = np.unique(list1, True, True) - assert(np.all(list1_u == ['1', '2', '3', '4'])) - assert(np.all(i_a == [2, 0, 4, 6])) - assert(np.all(i_b == [1, 1, 0, 1, 2, 2, 3, 3, 3, 3])) + ( + list1_u, + i_a, + i_b, + ) = np.unique(list1, True, True) + assert np.all(list1_u == ["1", "2", "3", "4"]) + assert np.all(i_a == [2, 0, 4, 6]) + assert np.all(i_b == [1, 1, 0, 1, 2, 2, 3, 3, 3, 3]) + - def test_intersect(): list1, list2 = create_lists() list1_u = np.unique(list1) list_i, i_a, i_b = intersect(list2, list1_u, return_index=True) - assert(np.all(list_i == ['1', '2'])) - assert(np.all(i_a == [1, 2])) - assert(np.all(i_b == [0, 1])) + assert np.all(list_i == ["1", "2"]) + assert np.all(i_a == [1, 2]) + assert np.all(i_b == [0, 1]) + - def test_setdiff(): list1, list2 = create_lists() list1_d = np.setdiff1d(list1, list2) - assert(np.all(list1_d == ['3', '4'])) + assert np.all(list1_d == ["3", "4"]) list2_d = np.setdiff1d(list2, list1) - assert(np.all(list2_d == ['0', '20'])) - + assert np.all(list2_d == ["0", "20"]) def test_split(): @@ -66,34 +70,33 @@ def test_split(): print(list1) print(list_s) # ['2', '2', '1', '2', '3', '3', '4', '4', '4', '4'] - assert(np.all(list_s == ['2', '2', '1'])) - assert(np.all(loc == [0, 1, 2])) - + assert np.all(list_s == ["2", "2", "1"]) + assert np.all(loc == [0, 1, 2]) + list_s, loc = split_list(list1, 2, 3) - assert(np.all(list_s == ['2', '3', '3'])) - assert(np.all(loc == [3, 4, 5])) + assert np.all(list_s == ["2", "3", "3"]) + assert np.all(loc == [3, 4, 5]) list_s, loc = split_list(list1, 3, 3) - assert(np.all(list_s == ['4', '4', '4', '4'])) - assert(np.all(loc == [i for i in range(6, 10)])) + assert np.all(list_s == ["4", "4", "4", "4"]) + assert np.all(loc == [i for i in range(6, 10)]) - def test_split_by_key(): list1, list2 = create_lists() list_s, loc = split_list_group_by_key(list1, 1, 3) - assert(np.all(list_s == ['1'])) - assert(np.all(loc == [2])) - + assert np.all(list_s == ["1"]) + assert np.all(loc == [2]) + list_s, loc = split_list_group_by_key(list1, 2, 3) - assert(np.all(list_s == ['2', '2', '2'])) - assert(np.all(loc == [0, 1, 3])) + assert np.all(list_s == ["2", "2", "2"]) + assert np.all(loc == [0, 1, 3]) list_s, loc = split_list_group_by_key(list1, 3, 3) - assert(np.all(list_s == ['3', '3', '4', '4', '4', '4'])) - assert(np.all(loc == [i for i in range(4, 10)])) + assert np.all(list_s == ["3", "3", "4", "4", "4", "4"]) + assert np.all(loc == [i for i in range(4, 10)]) -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_math.py b/tests/hyperion/utils/test_math.py index 96a5e903..e5a48cc3 100644 --- a/tests/hyperion/utils/test_math.py +++ b/tests/hyperion/utils/test_math.py @@ -9,9 +9,10 @@ from hyperion.utils.math import * + def create_matrices(dim): - x1 = np.random.randn(dim*10, dim) - x2 = np.random.randn(dim*10, dim) + x1 = np.random.randn(dim * 10, dim) + x2 = np.random.randn(dim * 10, dim) A = np.dot(x1.T, x1) return A, x1, x2 @@ -24,6 +25,7 @@ def test_logdet_pdmat(dim=10): logA = logdet_pdmat(A) assert 
np.allclose(logA_t, logA) + def test_invert_pdmat_leftinv(dim=10): A, x1, x2 = create_matrices(dim) @@ -31,18 +33,18 @@ def test_invert_pdmat_leftinv(dim=10): logA_t = np.log(la.det(A)) invAx2_t = np.dot(invA_t, x2.T) - # test invert_pdmat invA_f, RA, logA, invA = invert_pdmat( - A, right_inv=False, return_logdet=True, return_inv=True) - + A, right_inv=False, return_logdet=True, return_inv=True + ) + invAx2 = invA_f(x2.T) - + assert np.allclose(logA_t, logA) assert np.allclose(invA_t, invA) assert np.allclose(invAx2_t, invAx2) - + def test_invert_pdmat_rightinv(dim=10): A, x1, x2 = create_matrices(dim) @@ -51,15 +53,16 @@ def test_invert_pdmat_rightinv(dim=10): x2invA_t = np.dot(x2, invA_t) invA_f, RA, logA, invA = invert_pdmat( - A, right_inv=True, return_logdet=True, return_inv=True) + A, right_inv=True, return_logdet=True, return_inv=True + ) x2invA = invA_f(x2) - + assert np.allclose(logA_t, logA) assert np.allclose(invA_t, invA) assert np.allclose(x2invA_t, x2invA) - + # test invert_trimat upper triangular def test_invert_uppertrimat_leftinv(dim=10): @@ -67,82 +70,84 @@ def test_invert_uppertrimat_leftinv(dim=10): RA = invert_pdmat(A)[1] - B=RA + B = RA invB_t = la.inv(B) logB_t = np.log(la.det(B)) invBx2_t = np.dot(invB_t, x2.T) - + invB_f, logB, invB = invert_trimat( - B, lower=False, right_inv=False, return_logdet=True, return_inv=True) + B, lower=False, right_inv=False, return_logdet=True, return_inv=True + ) invBx2 = invB_f(x2.T) - + assert np.allclose(logB_t, logB) assert np.allclose(invB_t, invB) assert np.allclose(invBx2_t, invBx2) - + def test_invert_uppertrimat_rightinv(dim=10): - + A, x1, x2 = create_matrices(dim) RA = invert_pdmat(A)[1] - B=RA + B = RA invB_t = la.inv(B) logB_t = np.log(la.det(B)) - + x2invB_t = np.dot(x2, invB_t) invB_f, logB, invB = invert_trimat( - B, lower=False, right_inv=True, return_logdet=True, return_inv=True) + B, lower=False, right_inv=True, return_logdet=True, return_inv=True + ) x2invB = invB_f(x2) - + assert np.allclose(logB_t, logB) assert np.allclose(invB_t, invB) assert np.allclose(x2invB_t, x2invB) -# test invert_trimat lower triangular +# test invert_trimat lower triangular def test_invert_lowertrimat_leftinv(dim=10): - + A, x1, x2 = create_matrices(dim) RA = invert_pdmat(A)[1] - C=RA.T + C = RA.T invC_t = la.inv(C) logC_t = np.log(la.det(C)) invCx2_t = np.dot(invC_t, x2.T) - invC_f, logC, invC = invert_trimat( - C, lower=True, right_inv=False, return_logdet=True, return_inv=True) + C, lower=True, right_inv=False, return_logdet=True, return_inv=True + ) invCx2 = invC_f(x2.T) - + assert np.allclose(logC_t, logC) assert np.allclose(invC_t, invC) assert np.allclose(invCx2_t, invCx2) - def test_invert_lowertrimat_rightinv(dim=10): - + A, x1, x2 = create_matrices(dim) RA = invert_pdmat(A)[1] - C=RA.T + C = RA.T invC_t = la.inv(C) logC_t = np.log(la.det(C)) x2invC_t = np.dot(x2, invC_t) invC_f, logC, invC = invert_trimat( - C, lower=True, right_inv=True, return_logdet=True, return_inv=True) + C, lower=True, right_inv=True, return_logdet=True, return_inv=True + ) x2invC = invC_f(x2) - + assert np.allclose(logC_t, logC) assert np.allclose(invC_t, invC) assert np.allclose(x2invC_t, x2invC) @@ -151,33 +156,31 @@ def test_invert_lowertrimat_rightinv(dim=10): def test_softmax(dim=10): # test softmax rng = np.random.RandomState(seed=0) - y_t = rng.uniform(low=0., high=1.0, size=(dim*10, dim)) + y_t = rng.uniform(low=0.0, high=1.0, size=(dim * 10, dim)) y_t /= np.sum(y_t, axis=-1, keepdims=True) - z = np.log(y_t)+10 + z = np.log(y_t) + 10 y = 
softmax(z) assert np.allclose(y_t, y) - def test_logsumexp(dim=10): # test softmax rng = np.random.RandomState(seed=0) - y_t = rng.uniform(low=0., high=1.0, size=(dim*10, dim)) + y_t = rng.uniform(low=0.0, high=1.0, size=(dim * 10, dim)) z = np.log(y_t) - y_t = np.log(np.sum(y_t, axis=-1)+1e-20) + y_t = np.log(np.sum(y_t, axis=-1) + 1e-20) y = logsumexp(z) assert np.allclose(y_t, y, rtol=1e-5) - + # test fisher ratio def test_fisher_ratio(dim=10): A = create_matrices(dim)[0] - invA = invert_pdmat( - A, right_inv=False, return_logdet=False, return_inv=True)[-1] - + invA = invert_pdmat(A, right_inv=False, return_logdet=False, return_inv=True)[-1] + mu1 = np.random.randn(dim) mu2 = np.random.randn(dim) r1 = fisher_ratio(mu1, A, mu2, A) @@ -189,7 +192,7 @@ def test_fisher_ratio(dim=10): def test_symmat2vec(dim=10): A = create_matrices(dim)[0] - + v = symmat2vec(A, lower=False) Ac = vec2symmat(v, lower=False) assert np.allclose(A, Ac) @@ -198,13 +201,13 @@ def test_symmat2vec(dim=10): Ac = vec2symmat(v, lower=True) assert np.allclose(A, Ac) - + def test_trimat2vec(dim=10): A = create_matrices(dim)[0] B = la.cholesky(A, lower=False) C = B.T - + v = trimat2vec(B, lower=False) Bc = vec2trimat(v, lower=False) assert np.allclose(B, Bc) @@ -213,49 +216,47 @@ def test_trimat2vec(dim=10): Cc = vec2trimat(v, lower=True) assert np.allclose(C, Cc) - + # test fullcov flooring def test_fullcov_varfloor(dim=10): A = create_matrices(dim)[0] - u, d, _= la.svd(A, full_matrices=False) - assert np.allclose(A, np.dot(u*d, u.T)) - d1=d - d1[int(dim/2):]=0.0001 - D1=np.dot(u*d1, u.T) - - F=A - RF=la.cholesky(F) - DF_1=fullcov_varfloor(D1, RF, F_is_chol=True, lower=False) - - RF=la.cholesky(F).T - DF_2=fullcov_varfloor(D1, RF, F_is_chol=True, lower=True) + u, d, _ = la.svd(A, full_matrices=False) + assert np.allclose(A, np.dot(u * d, u.T)) + d1 = d + d1[int(dim / 2) :] = 0.0001 + D1 = np.dot(u * d1, u.T) + + F = A + RF = la.cholesky(F) + DF_1 = fullcov_varfloor(D1, RF, F_is_chol=True, lower=False) + + RF = la.cholesky(F).T + DF_2 = fullcov_varfloor(D1, RF, F_is_chol=True, lower=True) assert np.allclose(DF_1, F) assert np.allclose(DF_1, F) - - def test_fullcov_varfloor_from_cholS(dim=10): A = create_matrices(dim)[0] - u, d, _= la.svd(A, full_matrices=False) - assert np.allclose(A, np.dot(u*d, u.T)) - d1=d - d1[int(dim/2):]=0.0001 - D1=np.dot(u*d1, u.T) - - F=A - RD1=la.cholesky(D1) - RF=la.cholesky(F) - RF_1=fullcov_varfloor_from_cholS(RD1, RF, lower=False) - - RD1=la.cholesky(D1).T - RF=la.cholesky(F).T - RF_2=fullcov_varfloor_from_cholS(RD1, RF, lower=True) + u, d, _ = la.svd(A, full_matrices=False) + assert np.allclose(A, np.dot(u * d, u.T)) + d1 = d + d1[int(dim / 2) :] = 0.0001 + D1 = np.dot(u * d1, u.T) + + F = A + RD1 = la.cholesky(D1) + RF = la.cholesky(F) + RF_1 = fullcov_varfloor_from_cholS(RD1, RF, lower=False) + + RD1 = la.cholesky(D1).T + RF = la.cholesky(F).T + RF_2 = fullcov_varfloor_from_cholS(RD1, RF, lower=True) assert np.allclose(RF, RF_2) assert np.allclose(RF_1, RF_2.T) - -if __name__ == '__main__': + +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_plotting.py b/tests/hyperion/utils/test_plotting.py index a9395381..142710ff 100644 --- a/tests/hyperion/utils/test_plotting.py +++ b/tests/hyperion/utils/test_plotting.py @@ -8,51 +8,51 @@ import numpy as np import matplotlib -matplotlib.use('Agg') + +matplotlib.use("Agg") from hyperion.utils.plotting import * -output_dir = './tests/data_out/utils/plot' +output_dir = "./tests/data_out/utils/plot" if not 
os.path.exists(output_dir): os.makedirs(output_dir) def test_plot_gaussian(): - - mu=np.array([1, 2, 3]) - C=np.array([[2, 0.5, 0.2], [0.5, 1., 0.1], [0.2, 0.1, 0.8]]) + + mu = np.array([1, 2, 3]) + C = np.array([[2, 0.5, 0.2], [0.5, 1.0, 0.1], [0.2, 0.1, 0.8]]) la.cholesky(C) mu1 = mu[0] - C1 = C[0,0] - #plt.figure(figsize=(6, 6)) + C1 = C[0, 0] + # plt.figure(figsize=(6, 6)) plot_gaussian_1D(mu1, C1) # plt.show() - plt.savefig(output_dir + '/plot_gaussian_1D.pdf') + plt.savefig(output_dir + "/plot_gaussian_1D.pdf") plt.close() - + mu2 = mu[:2] - C2 = C[:2,:2] + C2 = C[:2, :2] fig = plt.figure(figsize=(6, 6)) - ax = fig.add_subplot(111, projection='3d') + ax = fig.add_subplot(111, projection="3d") plot_gaussian_3D(mu2, C2, ax=ax) # plt.show() - plt.savefig(output_dir + '/plot_gaussian_3D.pdf') + plt.savefig(output_dir + "/plot_gaussian_3D.pdf") plt.close() - - #plt.figure(figsize=(6, 6)) + + # plt.figure(figsize=(6, 6)) plot_gaussian_ellipsoid_2D(mu2, C2) - #plt.show() - plt.savefig(output_dir + '/plot_gaussian_ellipsoid_2D.pdf') + # plt.show() + plt.savefig(output_dir + "/plot_gaussian_ellipsoid_2D.pdf") plt.close() - + fig = plt.figure(figsize=(6, 6)) - ax = fig.add_subplot(111, projection='3d') + ax = fig.add_subplot(111, projection="3d") plot_gaussian_ellipsoid_3D(mu, C, ax=ax) # plt.show() - plt.savefig(output_dir + '/plot_gaussian_ellipsoid_3D.pdf') + plt.savefig(output_dir + "/plot_gaussian_ellipsoid_3D.pdf") plt.close() - -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_scp_list.py b/tests/hyperion/utils/test_scp_list.py index ad581983..1d2fa976 100644 --- a/tests/hyperion/utils/test_scp_list.py +++ b/tests/hyperion/utils/test_scp_list.py @@ -9,38 +9,38 @@ from hyperion.utils.scp_list import SCPList -output_dir = './tests/data_out/utils/scp_list' +output_dir = "./tests/data_out/utils/scp_list" if not os.path.exists(output_dir): - os.makedirs(output_dir) + os.makedirs(output_dir) def create_scp(): - - key = ['spk1']+['spk2']*2+['spk3']*3+['spk10']*10 - file_path = np.arange(len(key)).astype('U') + + key = ["spk1"] + ["spk2"] * 2 + ["spk3"] * 3 + ["spk10"] * 10 + file_path = np.arange(len(key)).astype("U") scp = SCPList(key, file_path) scp.sort() return scp def create_scp_with_offset(): - - key = ['spk1']+['spk2']*2+['spk3']*3+['spk10']*10 - file_path = np.arange(len(key)).astype('U') - offset = np.arange(len(key), dtype=np.int32)*10 + + key = ["spk1"] + ["spk2"] * 2 + ["spk3"] * 3 + ["spk10"] * 10 + file_path = np.arange(len(key)).astype("U") + offset = np.arange(len(key), dtype=np.int32) * 10 scp = SCPList(key, file_path, offset) scp.sort() return scp def create_scp_with_offset_range(): - - key = ['spk1']+['spk2']*2+['spk3']*3+['spk10']*10 - file_path = np.arange(len(key)).astype('U') - offset = np.arange(len(key), dtype=np.int32)*10 + + key = ["spk1"] + ["spk2"] * 2 + ["spk3"] * 3 + ["spk10"] * 10 + file_path = np.arange(len(key)).astype("U") + offset = np.arange(len(key), dtype=np.int32) * 10 range_spec = np.zeros((len(key), 2), dtype=np.int32) - range_spec[3:,0] = 5 - range_spec[10:,1] = 10 + range_spec[3:, 0] = 5 + range_spec[10:, 1] = 10 scp = SCPList(key, file_path, offset, range_spec) scp.sort() return scp @@ -59,38 +59,38 @@ def test_cmp(): scp2 = create_scp_with_offset_range() assert scp1 == scp2 - scp1 = SCPList([],[]) - scp2 = SCPList([],[]) + scp1 = SCPList([], []) + scp2 = SCPList([], []) assert scp1 == scp2 - + def test_save_load(): - file_txt = './tests/data_out/list.scp' + file_txt = 
"./tests/data_out/list.scp" scp1 = create_scp() scp1.save(file_txt) scp2 = SCPList.load(file_txt) assert scp1 == scp2 - file_txt = './tests/data_out/list_offset.scp' + file_txt = "./tests/data_out/list_offset.scp" scp1 = create_scp_with_offset() scp1.save(file_txt) scp2 = SCPList.load(file_txt) assert scp1 == scp2 - file_txt = './tests/data_out/list_offsetrange.scp' + file_txt = "./tests/data_out/list_offsetrange.scp" scp1 = create_scp_with_offset_range() scp1.save(file_txt) scp2 = SCPList.load(file_txt) assert scp1 == scp2 - + def test_split_merge(): scp1 = create_scp() - num_parts=3 + num_parts = 3 scp_list = [] for i in range(num_parts): - scp_i = scp1.split(i+1, num_parts) + scp_i = scp1.split(i + 1, num_parts) scp_list.append(scp_i) assert scp_list[0].len() == 1 @@ -100,13 +100,12 @@ def test_split_merge(): scp2 = SCPList.merge(scp_list) assert scp1 == scp2 - scp1 = create_scp_with_offset() - num_parts=3 + num_parts = 3 scp_list = [] for i in range(num_parts): - scp_i = scp1.split(i+1, num_parts) + scp_i = scp1.split(i + 1, num_parts) scp_list.append(scp_i) assert scp_list[0].len() == 1 @@ -116,13 +115,12 @@ def test_split_merge(): scp2 = SCPList.merge(scp_list) assert scp1 == scp2 - scp1 = create_scp_with_offset_range() - num_parts=3 + num_parts = 3 scp_list = [] for i in range(num_parts): - scp_i = scp1.split(i+1, num_parts) + scp_i = scp1.split(i + 1, num_parts) scp_list.append(scp_i) assert scp_list[0].len() == 1 @@ -133,38 +131,34 @@ def test_split_merge(): assert scp1 == scp2 - def test_filter(): - filter_key = ['spk2', 'spk10'] + filter_key = ["spk2", "spk10"] scp1 = create_scp() scp2 = scp1.filter(filter_key) - - f = np.zeros(len(scp1.key), dtype='bool') + + f = np.zeros(len(scp1.key), dtype="bool") f[1:13] = True scp3 = SCPList(scp1.key[f], scp1.file_path[f]) - + assert scp2 == scp3 scp1 = create_scp_with_offset_range() scp2 = scp1.filter(filter_key) - - f = np.zeros(len(scp1.key), dtype='bool') + + f = np.zeros(len(scp1.key), dtype="bool") f[1:13] = True - scp3 = SCPList(scp1.key[f], scp1.file_path[f], - scp1.offset[f], scp1.range_spec[f]) + scp3 = SCPList(scp1.key[f], scp1.file_path[f], scp1.offset[f], scp1.range_spec[f]) print(scp2.__dict__) print(scp3.__dict__) assert scp2 == scp3 - - filter_key=[] + filter_key = [] scp2 = scp1.filter(filter_key) - scp3 = SCPList([],[]) + scp3 = SCPList([], []) assert scp2 == scp3 - def test_shuffle(): scp1 = create_scp() @@ -184,13 +178,13 @@ def test_shuffle(): def test_getitem(): scp1 = create_scp() - assert scp1[1][1] == '6' + assert scp1[1][1] == "6" + + assert scp1["spk1"][0] == "0" + assert scp1["spk10"][0] == "15" - assert scp1['spk1'][0] == '0' - assert scp1['spk10'][0] == '15' + assert "spk1" in scp1 - assert 'spk1' in scp1 - -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_sparse_trial_key.py b/tests/hyperion/utils/test_sparse_trial_key.py index 8bb39877..81c35100 100644 --- a/tests/hyperion/utils/test_sparse_trial_key.py +++ b/tests/hyperion/utils/test_sparse_trial_key.py @@ -11,12 +11,12 @@ from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.sparse_trial_key import SparseTrialKey -output_dir = './tests/data_out/utils/trial' +output_dir = "./tests/data_out/utils/trial" if not os.path.exists(output_dir): - os.makedirs(output_dir) + os.makedirs(output_dir) - -def create_key(key_file='./tests/data_in/core-core_det5_key.h5'): + +def create_key(key_file="./tests/data_in/core-core_det5_key.h5"): key = TrialKey.load(key_file) key.sort() @@ -29,10 
+29,10 @@ def test_copy(): key1 = create_key() key2 = key1.copy() - key2.model_set[0] = 'm1' + key2.model_set[0] = "m1" key2.tar[:] = 0 - assert(np.any(key1.model_set != key2.model_set)) - assert(np.any(key1.tar.toarray() != key2.tar.toarray())) + assert np.any(key1.model_set != key2.model_set) + assert np.any(key1.tar.toarray() != key2.tar.toarray()) # def test_merge(): @@ -56,8 +56,9 @@ def test_copy(): def test_filter(): key1 = create_key() - key2 = SparseTrialKey(key1.model_set[:5], key1.seg_set[:10], - key1.tar[:5,:10], key1.non[:5,:10]) + key2 = SparseTrialKey( + key1.model_set[:5], key1.seg_set[:10], key1.tar[:5, :10], key1.non[:5, :10] + ) key3 = key1.filter(key2.model_set, key2.seg_set, keep=True) assert key2 == key3 @@ -91,11 +92,11 @@ def test_load_save(): # key3 = SparseTrialKey.load(file_h5) # assert key1 == key3 - file_txt = output_dir + '/test.txt' + file_txt = output_dir + "/test.txt" key1.save(file_txt) key2 = SparseTrialKey.load(file_txt) assert key1 == key2 -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_sparse_trial_scores.py b/tests/hyperion/utils/test_sparse_trial_scores.py index effe3b8c..af3ffe8b 100644 --- a/tests/hyperion/utils/test_sparse_trial_scores.py +++ b/tests/hyperion/utils/test_sparse_trial_scores.py @@ -12,22 +12,22 @@ from hyperion.utils.sparse_trial_key import SparseTrialKey from hyperion.utils.sparse_trial_scores import SparseTrialScores -output_dir = './tests/data_out/utils/trial' +output_dir = "./tests/data_out/utils/trial" if not os.path.exists(output_dir): os.makedirs(output_dir) -def create_scores(key_file='./tests/data_in/core-core_det5_key.h5'): +def create_scores(key_file="./tests/data_in/core-core_det5_key.h5"): key = TrialKey.load(key_file) mask = np.logical_or(key.tar, key.non) - scr1 = TrialScores(key.model_set, key.seg_set, - np.random.normal(size=key.tar.shape)*mask, - mask) - print('hola1', np.sum(mask), mask.shape) + scr1 = TrialScores( + key.model_set, key.seg_set, np.random.normal(size=key.tar.shape) * mask, mask + ) + print("hola1", np.sum(mask), mask.shape) scr1 = SparseTrialScores.from_trial_scores(scr1) - print('hola2', np.sum(scr1.score_mask.toarray()), scr1.score_mask.shape) + print("hola2", np.sum(scr1.score_mask.toarray()), scr1.score_mask.shape) key = SparseTrialKey.from_trial_key(key) return scr1, key @@ -35,16 +35,16 @@ def create_scores(key_file='./tests/data_in/core-core_det5_key.h5'): def test_copy_sort_align(): scr1, key = create_scores() - scr2=scr1.copy() + scr2 = scr1.copy() scr2.sort() assert scr2 != scr1 scr3 = scr2.align_with_ndx(key) assert scr1 == scr3 - + scr1.sort() scr2 = scr1.copy() - - scr2.model_set[0] = 'm1' + + scr2.model_set[0] = "m1" scr2.score_mask[:] = False assert np.any(scr1.model_set != scr2.model_set) assert len(scr1.score_mask.data) != len(scr2.score_mask.data) @@ -54,14 +54,14 @@ def test_copy_sort_align(): # scr1 = create_scores()[0] # scr1.sort() - + # scr2 = SparseTrialScores(scr1.model_set[:10], scr1.seg_set, # scr1.scores[:10,:], scr1.score_mask[:10,:]) # scr3 = SparseTrialScores(scr1.model_set[10:], scr1.seg_set, # scr1.scores[10:,:], scr1.score_mask[10:,:]) # scr4 = SparseTrialScores.merge([scr2, scr3]) # assert(scr1 == scr4) - + # scr2 = SparseTrialScores(scr1.model_set, scr1.seg_set[:10], # scr1.scores[:,:10], scr1.score_mask[:,:10]) # scr3 = SparseTrialScores(scr1.model_set, scr1.seg_set[10:], @@ -75,8 +75,12 @@ def test_filter(): scr1 = create_scores()[0] scr1.sort() - scr2 = SparseTrialScores(scr1.model_set[:5], 
scr1.seg_set[:10], - scr1.scores[:5,:10], scr1.score_mask[:5,:10]) + scr2 = SparseTrialScores( + scr1.model_set[:5], + scr1.seg_set[:10], + scr1.scores[:5, :10], + scr1.score_mask[:5, :10], + ) scr3 = scr1.filter(scr2.model_set, scr2.seg_set, keep=True) assert scr2 == scr3 @@ -100,15 +104,15 @@ def test_transform(): scr1 = create_scores()[0] scr1.sort() - - f = lambda x: 3*x + 1 + + f = lambda x: 3 * x + 1 scr2 = scr1.copy() - scr2.score_mask[0,0] = True - scr2.score_mask[0,1] = False + scr2.score_mask[0, 0] = True + scr2.score_mask[0, 1] = False scr4 = scr2.copy() scr4.transform(f) - assert(scr4.scores[0,0] == 3*scr1.scores[0,0] + 1) - assert(scr4.scores[0,1] == scr1.scores[0,1]) + assert scr4.scores[0, 0] == 3 * scr1.scores[0, 0] + 1 + assert scr4.scores[0, 1] == scr1.scores[0, 1] def test_get_tar_non(): @@ -118,17 +122,17 @@ def test_get_tar_non(): scr2 = scr1.align_with_ndx(key) key2 = key.copy() scr2.score_mask[:] = False - scr2.score_mask[0,0] = True - scr2.score_mask[0,1] = True - scr2.scores[0,0] = 1 - scr2.scores[0,1] = -1 + scr2.score_mask[0, 0] = True + scr2.score_mask[0, 1] = True + scr2.scores[0, 0] = 1 + scr2.scores[0, 1] = -1 key2.tar[:] = False key2.non[:] = False - key2.tar[0,0] = True - key2.non[0,1] = True + key2.tar[0, 0] = True + key2.non[0, 1] = True [tar, non] = scr2.get_tar_non(key2) - assert np.all(tar==[1]) - assert np.all(non==[-1]) + assert np.all(tar == [1]) + assert np.all(non == [-1]) def test_set_missing_to_value(): @@ -138,36 +142,36 @@ def test_set_missing_to_value(): scr2 = scr1.align_with_ndx(key) key2 = key.copy() scr2.score_mask[:] = False - scr2.score_mask[0,0] = True - scr2.score_mask[0,1] = True - scr2.scores[0,0] = 1 - scr2.scores[0,1] = -1 + scr2.score_mask[0, 0] = True + scr2.score_mask[0, 1] = True + scr2.scores[0, 0] = 1 + scr2.scores[0, 1] = -1 key2.tar[:] = False key2.non[:] = False - key2.tar[0,0] = True - key2.non[0,1] = True + key2.tar[0, 0] = True + key2.non[0, 1] = True - scr2.score_mask[0,0] = False + scr2.score_mask[0, 0] = False scr4 = scr2.set_missing_to_value(key2, -10) - assert scr4.scores[0,0] == -10 + assert scr4.scores[0, 0] == -10 def test_load_save(): scr1 = create_scores()[0] scr1.sort() - + # file_h5 = output_dir + '/test.h5' # scr1.save(file_h5) # scr2 = SparseTrialScores.load(file_h5) # assert scr1 == scr2 - - file_txt = output_dir + '/test.txt' + + file_txt = output_dir + "/test.txt" scr1.save(file_txt) scr2 = SparseTrialScores.load(file_txt) - + assert scr1 == scr2 -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_trial_key.py b/tests/hyperion/utils/test_trial_key.py index a972451f..ca33403d 100644 --- a/tests/hyperion/utils/test_trial_key.py +++ b/tests/hyperion/utils/test_trial_key.py @@ -10,12 +10,12 @@ from hyperion.utils.trial_key import TrialKey from hyperion.utils.trial_ndx import TrialNdx -output_dir = './tests/data_out/utils/trial' +output_dir = "./tests/data_out/utils/trial" if not os.path.exists(output_dir): - os.makedirs(output_dir) + os.makedirs(output_dir) - -def create_key(key_file='./tests/data_in/core-core_det5_key.h5'): + +def create_key(key_file="./tests/data_in/core-core_det5_key.h5"): key = TrialKey.load(key_file) key.sort() @@ -27,50 +27,51 @@ def test_copy(): key1 = create_key() key2 = key1.copy() - key2.model_set[0] = 'm1' + key2.model_set[0] = "m1" key2.tar[:] = 0 - assert(np.any(key1.model_set != key2.model_set)) - assert(np.any(key1.tar != key2.tar)) + assert np.any(key1.model_set != key2.model_set) + assert np.any(key1.tar != 
key2.tar) def test_merge(): key1 = create_key() - key2 = TrialKey(key1.model_set[:10], key1.seg_set, - key1.tar[:10,:], key1.non[:10,:]) - key3 = TrialKey(key1.model_set[5:], key1.seg_set, - key1.tar[5:,:], key1.non[5:,:]) + key2 = TrialKey( + key1.model_set[:10], key1.seg_set, key1.tar[:10, :], key1.non[:10, :] + ) + key3 = TrialKey(key1.model_set[5:], key1.seg_set, key1.tar[5:, :], key1.non[5:, :]) key4 = TrialKey.merge([key2, key3]) - assert(key1 == key4) + assert key1 == key4 - key2 = TrialKey(key1.model_set, key1.seg_set[:10], - key1.tar[:,:10], key1.non[:,:10]) - key3 = TrialKey(key1.model_set, key1.seg_set[5:], - key1.tar[:,5:], key1.non[:,5:]) + key2 = TrialKey( + key1.model_set, key1.seg_set[:10], key1.tar[:, :10], key1.non[:, :10] + ) + key3 = TrialKey(key1.model_set, key1.seg_set[5:], key1.tar[:, 5:], key1.non[:, 5:]) key4 = TrialKey.merge([key2, key3]) - assert(key1 == key4) + assert key1 == key4 def test_filter(): key1 = create_key() - key2 = TrialKey(key1.model_set[:5], key1.seg_set[:10], - key1.tar[:5,:10], key1.non[:5,:10]) + key2 = TrialKey( + key1.model_set[:5], key1.seg_set[:10], key1.tar[:5, :10], key1.non[:5, :10] + ) key3 = key1.filter(key2.model_set, key2.seg_set, keep=True) - assert(key2 == key3) + assert key2 == key3 def test_split(): key1 = create_key() - num_parts=3 + num_parts = 3 key_list = [] for i in range(num_parts): for j in range(num_parts): - key_ij = key1.split(i+1, num_parts, j+1, num_parts) + key_ij = key1.split(i + 1, num_parts, j + 1, num_parts) key_list.append(key_ij) key2 = TrialKey.merge(key_list) - assert(key1 == key2) + assert key1 == key2 def test_to_ndx(): @@ -80,21 +81,22 @@ def test_to_ndx(): ndx1 = key1.to_ndx() ndx1.validate() + def test_load_save(): key1 = create_key() - file_h5 = output_dir + '/test.h5' + file_h5 = output_dir + "/test.h5" key1.save(file_h5) key3 = TrialKey.load(file_h5) - assert(key1 == key3) + assert key1 == key3 - file_txt = output_dir + '/test.txt' + file_txt = output_dir + "/test.txt" key3.tar[0, :] = True key3.non[:, 0] = True key3.save(file_txt) key2 = TrialKey.load(file_txt) - assert(key3 == key2) + assert key3 == key2 -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_trial_ndx.py b/tests/hyperion/utils/test_trial_ndx.py index 0e27c376..3fe3942e 100644 --- a/tests/hyperion/utils/test_trial_ndx.py +++ b/tests/hyperion/utils/test_trial_ndx.py @@ -9,12 +9,12 @@ from hyperion.utils.trial_ndx import TrialNdx -output_dir = './tests/data_out/utils/trial' +output_dir = "./tests/data_out/utils/trial" if not os.path.exists(output_dir): - os.makedirs(output_dir) + os.makedirs(output_dir) - -def create_ndx(ndx_file='./tests/data_in/core-core_det5_ndx.h5'): + +def create_ndx(ndx_file="./tests/data_in/core-core_det5_ndx.h5"): ndx = TrialNdx.load(ndx_file) ndx.sort() @@ -26,68 +26,63 @@ def test_copy(): ndx1 = create_ndx() ndx2 = ndx1.copy() - ndx2.model_set[0] = 'm1' + ndx2.model_set[0] = "m1" ndx2.trial_mask[:] = 0 - assert(np.any(ndx1.model_set != ndx2.model_set)) - assert(np.any(ndx1.trial_mask != ndx2.trial_mask)) + assert np.any(ndx1.model_set != ndx2.model_set) + assert np.any(ndx1.trial_mask != ndx2.trial_mask) def test_merge(): ndx1 = create_ndx() - ndx2 = TrialNdx(ndx1.model_set[:10], ndx1.seg_set, - ndx1.trial_mask[:10,:]) - ndx3 = TrialNdx(ndx1.model_set[5:], ndx1.seg_set, - ndx1.trial_mask[5:,:]) + ndx2 = TrialNdx(ndx1.model_set[:10], ndx1.seg_set, ndx1.trial_mask[:10, :]) + ndx3 = TrialNdx(ndx1.model_set[5:], ndx1.seg_set, ndx1.trial_mask[5:, :]) 
ndx4 = TrialNdx.merge([ndx2, ndx3]) - assert(ndx1 == ndx4) + assert ndx1 == ndx4 - ndx2 = TrialNdx(ndx1.model_set, ndx1.seg_set[:10], - ndx1.trial_mask[:,:10]) - ndx3 = TrialNdx(ndx1.model_set, ndx1.seg_set[5:], - ndx1.trial_mask[:,5:]) + ndx2 = TrialNdx(ndx1.model_set, ndx1.seg_set[:10], ndx1.trial_mask[:, :10]) + ndx3 = TrialNdx(ndx1.model_set, ndx1.seg_set[5:], ndx1.trial_mask[:, 5:]) ndx4 = TrialNdx.merge([ndx2, ndx3]) - assert(ndx1 == ndx4) + assert ndx1 == ndx4 def test_filter(): ndx1 = create_ndx() - ndx2 = TrialNdx(ndx1.model_set[:5], ndx1.seg_set[:10], - ndx1.trial_mask[:5,:10]) + ndx2 = TrialNdx(ndx1.model_set[:5], ndx1.seg_set[:10], ndx1.trial_mask[:5, :10]) ndx3 = ndx1.filter(ndx2.model_set, ndx2.seg_set, keep=True) - assert(ndx2 == ndx3) + assert ndx2 == ndx3 def test_split(): ndx1 = create_ndx() - - num_parts=3 + + num_parts = 3 ndx_list = [] for i in range(num_parts): for j in range(num_parts): - ndx_ij = ndx1.split(i+1, num_parts, j+1, num_parts) + ndx_ij = ndx1.split(i + 1, num_parts, j + 1, num_parts) ndx_list.append(ndx_ij) ndx2 = TrialNdx.merge(ndx_list) - assert(ndx1 == ndx2) + assert ndx1 == ndx2 def test_load_save(): ndx1 = create_ndx() - file_h5 = output_dir + '/test.h5' + file_h5 = output_dir + "/test.h5" ndx1.save(file_h5) ndx3 = TrialNdx.load(file_h5) - assert(ndx1 == ndx3) - - file_txt = output_dir + '/test.txt' + assert ndx1 == ndx3 + + file_txt = output_dir + "/test.txt" ndx3.trial_mask[0, :] = True ndx3.trial_mask[:, 0] = True ndx3.save(file_txt) ndx2 = TrialNdx.load(file_txt) - assert(ndx3 == ndx2) + assert ndx3 == ndx2 -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_trial_scores.py b/tests/hyperion/utils/test_trial_scores.py index cb075aa8..96eef7ad 100644 --- a/tests/hyperion/utils/test_trial_scores.py +++ b/tests/hyperion/utils/test_trial_scores.py @@ -11,58 +11,62 @@ from hyperion.utils.trial_ndx import TrialNdx from hyperion.utils.trial_scores import TrialScores -output_dir = './tests/data_out/utils/trial' +output_dir = "./tests/data_out/utils/trial" if not os.path.exists(output_dir): os.makedirs(output_dir) -def create_scores(key_file='./tests/data_in/core-core_det5_key.h5'): +def create_scores(key_file="./tests/data_in/core-core_det5_key.h5"): key = TrialKey.load(key_file) mask = np.logical_or(key.tar, key.non) - scr1 = TrialScores(key.model_set, key.seg_set, - np.random.normal(size=key.tar.shape)*mask, - mask) + scr1 = TrialScores( + key.model_set, key.seg_set, np.random.normal(size=key.tar.shape) * mask, mask + ) return scr1, key def test_copy_sort_align(): scr1, key = create_scores() - scr2=scr1.copy() + scr2 = scr1.copy() scr2.sort() - assert(scr2 != scr1) + assert scr2 != scr1 scr3 = scr2.align_with_ndx(key) - assert(scr1 == scr3) - + assert scr1 == scr3 + scr1.sort() scr2 = scr1.copy() - - scr2.model_set[0] = 'm1' + + scr2.model_set[0] = "m1" scr2.score_mask[:] = 0 - assert(np.any(scr1.model_set != scr2.model_set)) - assert(np.any(scr1.score_mask != scr2.score_mask)) + assert np.any(scr1.model_set != scr2.model_set) + assert np.any(scr1.score_mask != scr2.score_mask) def test_merge(): scr1 = create_scores()[0] scr1.sort() - - scr2 = TrialScores(scr1.model_set[:10], scr1.seg_set, - scr1.scores[:10,:], scr1.score_mask[:10,:]) - scr3 = TrialScores(scr1.model_set[10:], scr1.seg_set, - scr1.scores[10:,:], scr1.score_mask[10:,:]) + + scr2 = TrialScores( + scr1.model_set[:10], scr1.seg_set, scr1.scores[:10, :], scr1.score_mask[:10, :] + ) + scr3 = TrialScores( + scr1.model_set[10:], 
scr1.seg_set, scr1.scores[10:, :], scr1.score_mask[10:, :] + ) scr4 = TrialScores.merge([scr2, scr3]) - assert(scr1 == scr4) - - scr2 = TrialScores(scr1.model_set, scr1.seg_set[:10], - scr1.scores[:,:10], scr1.score_mask[:,:10]) - scr3 = TrialScores(scr1.model_set, scr1.seg_set[10:], - scr1.scores[:,10:], scr1.score_mask[:,10:]) + assert scr1 == scr4 + + scr2 = TrialScores( + scr1.model_set, scr1.seg_set[:10], scr1.scores[:, :10], scr1.score_mask[:, :10] + ) + scr3 = TrialScores( + scr1.model_set, scr1.seg_set[10:], scr1.scores[:, 10:], scr1.score_mask[:, 10:] + ) scr4 = TrialScores.merge([scr2, scr3]) - assert(scr1 == scr4) + assert scr1 == scr4 def test_filter(): @@ -70,10 +74,14 @@ def test_filter(): scr1 = create_scores()[0] scr1.sort() - scr2 = TrialScores(scr1.model_set[:5], scr1.seg_set[:10], - scr1.scores[:5,:10], scr1.score_mask[:5,:10]) + scr2 = TrialScores( + scr1.model_set[:5], + scr1.seg_set[:10], + scr1.scores[:5, :10], + scr1.score_mask[:5, :10], + ) scr3 = scr1.filter(scr2.model_set, scr2.seg_set, keep=True) - assert(scr2 == scr3) + assert scr2 == scr3 def test_split(): @@ -81,29 +89,29 @@ def test_split(): scr1 = create_scores()[0] scr1.sort() - num_parts=3 + num_parts = 3 scr_list = [] for i in range(num_parts): for j in range(num_parts): - scr_ij = scr1.split(i+1, num_parts, j+1, num_parts) + scr_ij = scr1.split(i + 1, num_parts, j + 1, num_parts) scr_list.append(scr_ij) scr2 = TrialScores.merge(scr_list) - assert(scr1 == scr2) + assert scr1 == scr2 def test_transform(): scr1 = create_scores()[0] scr1.sort() - - f = lambda x: 3*x + 1 + + f = lambda x: 3 * x + 1 scr2 = scr1.copy() - scr2.score_mask[0,0] = True - scr2.score_mask[0,1] = False + scr2.score_mask[0, 0] = True + scr2.score_mask[0, 1] = False scr4 = scr2.copy() scr4.transform(f) - assert(scr4.scores[0,0] == 3*scr1.scores[0,0] + 1) - assert(scr4.scores[0,1] == scr1.scores[0,1]) + assert scr4.scores[0, 0] == 3 * scr1.scores[0, 0] + 1 + assert scr4.scores[0, 1] == scr1.scores[0, 1] def test_get_tar_non(): @@ -113,17 +121,17 @@ def test_get_tar_non(): scr2 = scr1.align_with_ndx(key) key2 = key.copy() scr2.score_mask[:] = False - scr2.score_mask[0,0] = True - scr2.score_mask[0,1] = True - scr2.scores[0,0] = 1 - scr2.scores[0,1] = -1 + scr2.score_mask[0, 0] = True + scr2.score_mask[0, 1] = True + scr2.scores[0, 0] = 1 + scr2.scores[0, 1] = -1 key2.tar[:] = False key2.non[:] = False - key2.tar[0,0] = True - key2.non[0,1] = True + key2.tar[0, 0] = True + key2.non[0, 1] = True [tar, non] = scr2.get_tar_non(key2) - assert(np.all(tar==[1])) - assert(np.all(non==[-1])) + assert np.all(tar == [1]) + assert np.all(non == [-1]) def test_set_missing_to_value(): @@ -133,40 +141,40 @@ def test_set_missing_to_value(): scr2 = scr1.align_with_ndx(key) key2 = key.copy() scr2.score_mask[:] = False - scr2.score_mask[0,0] = True - scr2.score_mask[0,1] = True - scr2.scores[0,0] = 1 - scr2.scores[0,1] = -1 + scr2.score_mask[0, 0] = True + scr2.score_mask[0, 1] = True + scr2.scores[0, 0] = 1 + scr2.scores[0, 1] = -1 key2.tar[:] = False key2.non[:] = False - key2.tar[0,0] = True - key2.non[0,1] = True + key2.tar[0, 0] = True + key2.non[0, 1] = True - scr2.score_mask[0,0] = False + scr2.score_mask[0, 0] = False scr4 = scr2.set_missing_to_value(key2, -10) - assert(scr4.scores[0,0] == -10) + assert scr4.scores[0, 0] == -10 def test_load_save(): scr1 = create_scores()[0] scr1.sort() - - file_h5 = output_dir + '/test.h5' + + file_h5 = output_dir + "/test.h5" scr1.save(file_h5) scr2 = TrialScores.load(file_h5) - assert(scr1 == scr2) - - file_txt 
= output_dir + '/test.txt' + assert scr1 == scr2 + + file_txt = output_dir + "/test.txt" scr1.score_mask[:, :] = False scr1.score_mask[0, :] = True scr1.score_mask[:, 0] = True - scr1.scores[scr1.score_mask==False]=0 + scr1.scores[scr1.score_mask == False] = 0 scr1.save(file_txt) scr2 = TrialScores.load(file_txt) - - assert(scr1 == scr2) + + assert scr1 == scr2 -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_utt2info.py b/tests/hyperion/utils/test_utt2info.py index ddebedcb..08e02a77 100644 --- a/tests/hyperion/utils/test_utt2info.py +++ b/tests/hyperion/utils/test_utt2info.py @@ -9,35 +9,34 @@ from hyperion.utils.utt2info import Utt2Info -output_dir = './tests/data_out/utils/utt2info' +output_dir = "./tests/data_out/utils/utt2info" if not os.path.exists(output_dir): - os.makedirs(output_dir) - + os.makedirs(output_dir) + def create_utt2info(): - - spk_ids = ['spk1']+['spk2']*2+['spk3']*3+['spk10']*10 - key = np.arange(len(spk_ids)).astype('U') + + spk_ids = ["spk1"] + ["spk2"] * 2 + ["spk3"] * 3 + ["spk10"] * 10 + key = np.arange(len(spk_ids)).astype("U") u2i = Utt2Info.create(key, spk_ids) u2i.sort(field=1) return u2i - def test_cmp(): u2i1 = create_utt2info() u2i2 = create_utt2info() assert u2i1 == u2i2 - u2i1 = Utt2Info.create([],[]) - u2i2 = Utt2Info.create([],[]) + u2i1 = Utt2Info.create([], []) + u2i2 = Utt2Info.create([], []) assert u2i1 == u2i2 def test_sort(): u2i = create_utt2info() - spk_ids = ['spk1']+['spk10']*10+['spk2']*2+['spk3']*3 + spk_ids = ["spk1"] + ["spk10"] * 10 + ["spk2"] * 2 + ["spk3"] * 3 assert np.all(u2i.info == np.asarray(spk_ids)) @@ -49,46 +48,43 @@ def test_len(): def test_get_index(): u2i = create_utt2info() - assert u2i.get_index('1') == 11 - + assert u2i.get_index("1") == 11 + def test_contains(): u2i = create_utt2info() - assert '10' in u2i - assert not('100' in u2i) - + assert "10" in u2i + assert not ("100" in u2i) def test_getitem(): u2i1 = create_utt2info() - assert u2i1[1][1] == 'spk10' + assert u2i1[1][1] == "spk10" - assert u2i1['0'] == 'spk1' + assert u2i1["0"] == "spk1" - def test_save_load(): - file_txt = output_dir + '/utt2spk' + file_txt = output_dir + "/utt2spk" u2i1 = create_utt2info() u2i1.save(file_txt) u2i2 = Utt2Info.load(file_txt) print(u2i1.utt_info) print(u2i2.utt_info) - print(np.asarray(u2i1.utt_info['key']) == np.asarray(u2i2.utt_info['key'])) - print(np.asarray(u2i1.utt_info['key'])) - print(np.asarray(u2i2.utt_info['key'])) + print(np.asarray(u2i1.utt_info["key"]) == np.asarray(u2i2.utt_info["key"])) + print(np.asarray(u2i1.utt_info["key"])) + print(np.asarray(u2i2.utt_info["key"])) assert u2i1 == u2i2 - - + def test_split_merge(): u2i1 = create_utt2info() - num_parts=3 + num_parts = 3 u2i_list = [] for i in range(num_parts): - u2i_i = u2i1.split(i+1, num_parts, group_by_field=1) + u2i_i = u2i1.split(i + 1, num_parts, group_by_field=1) u2i_list.append(u2i_i) assert u2i_list[0].len() == 1 @@ -99,33 +95,31 @@ def test_split_merge(): assert u2i1 == u2i2 - def test_filter(): - filter_key = ['0', '1', '2'] + filter_key = ["0", "1", "2"] u2i1 = create_utt2info() u2i2 = u2i1.filter(filter_key) - + idx = [0, 11, 12] u2i3 = Utt2Info.create(u2i1.key[idx], u2i1.info[idx]) - + assert u2i2 == u2i3 - def test_filter_info(): - filter_key = ['spk2', 'spk10'] + filter_key = ["spk2", "spk10"] u2i1 = create_utt2info() u2i2 = u2i1.filter_info(filter_key) - - f = np.zeros(len(u2i1.key), dtype='bool') + + f = np.zeros(len(u2i1.key), dtype="bool") f[1:13] = True u2i3 = 
Utt2Info.create(u2i1.key[f], u2i1.info[f]) print(u2i2.utt_info) print(u2i3.utt_info) - + assert u2i2 == u2i3 @@ -134,14 +128,13 @@ def test_filter_index(): u2i1 = create_utt2info() u2i2 = u2i1.filter_index(filter_key) - + idx = [0, 11, 12] u2i3 = Utt2Info.create(u2i1.key[idx], u2i1.info[idx]) assert u2i2 == u2i3 - def test_shuffle(): u2i1 = create_utt2info() @@ -152,6 +145,5 @@ def test_shuffle(): assert u2i1 == u2i2 - -if __name__ == '__main__': +if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/hyperion/utils/test_vad_utils.py b/tests/hyperion/utils/test_vad_utils.py index abac75bb..a284c9e0 100644 --- a/tests/hyperion/utils/test_vad_utils.py +++ b/tests/hyperion/utils/test_vad_utils.py @@ -12,18 +12,11 @@ def test_merge_vad_timestamps(): - t_in = np.asarray([ - [0.01, 3.4], - [1.01, 2.3], - [2.50, 3.7], - [5.1, 6.3], - [7, 8], - [7.5, 9]]) - - t_target = np.asarray([ - [0.01, 3.7], - [5.1, 6.3], - [7, 9]]) + t_in = np.asarray( + [[0.01, 3.4], [1.01, 2.3], [2.50, 3.7], [5.1, 6.3], [7, 8], [7.5, 9]] + ) + + t_target = np.asarray([[0.01, 3.7], [5.1, 6.3], [7, 9]]) t_out = merge_vad_timestamps(t_in) @@ -32,14 +25,13 @@ def test_merge_vad_timestamps(): def test_bin_vad_to_timestamps(): - - vad = np.asarray( - [1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1], - dtype=np.bool) - t_target = np.asarray([ - [0, 25], - [3*10, 8*10+25], - [12*10, 13*10+25]], dtype=np.float) - (25.-10.)/2 + vad = np.asarray([1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1], dtype=np.bool) + t_target = ( + np.asarray( + [[0, 25], [3 * 10, 8 * 10 + 25], [12 * 10, 13 * 10 + 25]], dtype=np.float + ) + - (25.0 - 10.0) / 2 + ) t_target[0, 0] = 0 t_out = bin_vad_to_timestamps(vad, 25, 10) @@ -48,12 +40,10 @@ def test_bin_vad_to_timestamps(): def test_bin_vad_to_timestamps_snipedges(): - vad = np.asarray( - [0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1], - dtype=np.bool) - t_target = np.asarray([ - [3*10, 8*10+25], - [12*10, 13*10+25]], dtype=np.float) + vad = np.asarray([0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1], dtype=np.bool) + t_target = np.asarray( + [[3 * 10, 8 * 10 + 25], [12 * 10, 13 * 10 + 25]], dtype=np.float + ) t_out = bin_vad_to_timestamps(vad, 25, 10, snip_edges=True) assert_allclose(t_out, t_target) @@ -62,161 +52,138 @@ def test_bin_vad_to_timestamps_snipedges(): def test_vad_timestamps_to_bin(): vad_target = np.asarray( - [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1], - dtype=np.bool) - t_in = np.asarray([ - [0, 25], - [3*10, 8*10+25], - [12*10, 13*10+25]], dtype=np.float) - (25.-10.)/2 + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1], dtype=np.bool + ) + t_in = ( + np.asarray( + [[0, 25], [3 * 10, 8 * 10 + 25], [12 * 10, 13 * 10 + 25]], dtype=np.float + ) + - (25.0 - 10.0) / 2 + ) t_in[0, 0] = 0 vad_out = vad_timestamps_to_bin(t_in, 25, 10) assert_allclose(vad_out, vad_target) def test_vad_timestamps_to_bin_snipedges(): - - vad_target = np.asarray( - [0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1], - dtype=np.bool) - t_in = np.asarray([ - [3*10, 7*10+25], - [12*10, 12*10+25]], dtype=np.float) - + + vad_target = np.asarray([0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1], dtype=np.bool) + t_in = np.asarray([[3 * 10, 7 * 10 + 25], [12 * 10, 12 * 10 + 25]], dtype=np.float) + vad_out = vad_timestamps_to_bin(t_in, 25, 10, snip_edges=True) assert_allclose(vad_out, vad_target) - vad_target = np.asarray( - [1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1], - dtype=np.bool) - t_in = np.asarray([ - [0, 13], - [3*10, 7*10+25], - [12*10, 12*10+25]], dtype=np.float) - + vad_target = np.asarray([1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1], 
dtype=np.bool) + t_in = np.asarray( + [[0, 13], [3 * 10, 7 * 10 + 25], [12 * 10, 12 * 10 + 25]], dtype=np.float + ) + vad_out = vad_timestamps_to_bin(t_in, 25, 10, snip_edges=True) assert_allclose(vad_out, vad_target) - def test_timestamps_wrt_vad_to_absolute_timestamps(): - vad = np.asarray([ - [1.0, 2.0], - [4.0, 5.0], - [6.0, 7.0]]) + vad = np.asarray([[1.0, 2.0], [4.0, 5.0], [6.0, 7.0]]) - t_in = np.asarray([ - [0.5, 1.0], - [1.25, 1.5], - [1.75, 2.75]]) + t_in = np.asarray([[0.5, 1.0], [1.25, 1.5], [1.75, 2.75]]) - t_target = np.asarray([ - [1.5, 2.0], - [4.25, 4.50], - [4.75, 5.0], - [6.0, 6.75]]) + t_target = np.asarray([[1.5, 2.0], [4.25, 4.50], [4.75, 5.0], [6.0, 6.75]]) - t_out = timestamps_wrt_vad_to_absolute_timestamps( - t_in, vad) + t_out = timestamps_wrt_vad_to_absolute_timestamps(t_in, vad) assert_allclose(t_out, t_target) - def test_timestamps_wrt_bin_vad_to_absolute_timestamps(): - vad = np.asarray([ - [1.0, 2.0], - [4.0, 5.0], - [6.0, 7.0]]) + vad = np.asarray([[1.0, 2.0], [4.0, 5.0], [6.0, 7.0]]) vad = vad_timestamps_to_bin(vad, 0.025, 0.010) - t_in = np.asarray([ - [0.5, 1.0], - [1.25, 1.5], - [1.75, 2.75]]) + t_in = np.asarray([[0.5, 1.0], [1.25, 1.5], [1.75, 2.75]]) - t_target = np.asarray([ - [1.5, 2.0], - [4.25, 4.50], - [4.75, 5.0], - [6.0, 6.75]]) + t_target = np.asarray([[1.5, 2.0], [4.25, 4.50], [4.75, 5.0], [6.0, 6.75]]) - t_out = timestamps_wrt_bin_vad_to_absolute_timestamps( - t_in, vad, 0.025, 0.010) + t_out = timestamps_wrt_bin_vad_to_absolute_timestamps(t_in, vad, 0.025, 0.010) assert_allclose(t_out, t_target, atol=0.05) - def test_intersect_segment_timestamps_with_vad(): - t_in = np.asarray([ - [0, 0.75], - [0, 1.0], - [0, 1.25], - [0, 1.5], - [0.25, 1.75], - [0.5, 2.0], - [0.75, 2.25], - [1.0, 2.5], - [1.25, 2.75], - [1.5, 3.0], - [1.75, 3.25], - [2.0, 3.5], - [2.25, 3.75], - [2.5, 4.0], - [2.75, 4.25], - [3.0, 4.5], - [3.25, 4.75], - [3.5, 5.0], - [3.75, 5.25], - [4.0, 5.5], - [4.25, 5.75], - [4.5, 6.0], - [4.75, 6.25], - [5.0, 6.5]]) - - vad = np.asarray([ - [1.3, 2.0], - [2.1, 2.7], - [5.0, 6.0]]) + t_in = np.asarray( + [ + [0, 0.75], + [0, 1.0], + [0, 1.25], + [0, 1.5], + [0.25, 1.75], + [0.5, 2.0], + [0.75, 2.25], + [1.0, 2.5], + [1.25, 2.75], + [1.5, 3.0], + [1.75, 3.25], + [2.0, 3.5], + [2.25, 3.75], + [2.5, 4.0], + [2.75, 4.25], + [3.0, 4.5], + [3.25, 4.75], + [3.5, 5.0], + [3.75, 5.25], + [4.0, 5.5], + [4.25, 5.75], + [4.5, 6.0], + [4.75, 6.25], + [5.0, 6.5], + ] + ) + + vad = np.asarray([[1.3, 2.0], [2.1, 2.7], [5.0, 6.0]]) speech_target = np.ones((t_in.shape[0],), dtype=np.bool) speech_target[:3] = False speech_target[14:18] = False - t_target = np.asarray([ - [1.3, 1.5], - [1.3, 1.75], - [1.3, 2.0], - [1.3, 2], [2.1, 2.25], - [1.3, 2.0], [2.1, 2.5], - [1.3, 2.0], [2.1, 2.7], - [1.5, 2.0], [2.1, 2.7], - [1.75, 2.0], [2.1, 2.7], - [2.1, 2.7], - [2.25, 2.7], - [2.5, 2.7], - [5.0, 5.25], - [5.0, 5.5], - [5.0, 5.75], - [5.0, 6.0], - [5.0, 6.0], - [5.0, 6.0]]) - - out2speech_target = np.asarray([ - 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, - 8, 9, 10, 11, 12, 13, 14, 15, 16], dtype=np.int) - - speech_idx, t_out, out2speech_idx = intersect_segment_timestamps_with_vad( - t_in, vad) + t_target = np.asarray( + [ + [1.3, 1.5], + [1.3, 1.75], + [1.3, 2.0], + [1.3, 2], + [2.1, 2.25], + [1.3, 2.0], + [2.1, 2.5], + [1.3, 2.0], + [2.1, 2.7], + [1.5, 2.0], + [2.1, 2.7], + [1.75, 2.0], + [2.1, 2.7], + [2.1, 2.7], + [2.25, 2.7], + [2.5, 2.7], + [5.0, 5.25], + [5.0, 5.5], + [5.0, 5.75], + [5.0, 6.0], + [5.0, 6.0], + [5.0, 6.0], + ] + ) + + out2speech_target = 
np.asarray( + [0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], + dtype=np.int, + ) + + speech_idx, t_out, out2speech_idx = intersect_segment_timestamps_with_vad(t_in, vad) assert_allclose(speech_idx, speech_target) assert_allclose(t_out, t_target) assert_allclose(out2speech_idx, out2speech_idx) - -if __name__ == '__main__': - pytest.main([__file__]) +if __name__ == "__main__": + pytest.main([__file__])