From d773ab9e467353448dc155ea9be379ab1dda48f4 Mon Sep 17 00:00:00 2001
From: naxingyu
Date: Mon, 3 Aug 2015 11:02:21 +0800
Subject: [PATCH 1/7] add Convolution component in nnet2

---
 src/nnet2/nnet-component-test.cc |  39 +++
 src/nnet2/nnet-component.cc      | 440 +++++++++++++++++++++++++++++++
 src/nnet2/nnet-component.h       |  64 +++++
 3 files changed, 543 insertions(+)

diff --git a/src/nnet2/nnet-component-test.cc b/src/nnet2/nnet-component-test.cc
index 94248b242b3..dd84b43aa02 100644
--- a/src/nnet2/nnet-component-test.cc
+++ b/src/nnet2/nnet-component-test.cc
@@ -337,6 +337,44 @@ void UnitTestAffineComponent() {
   }
 }
 
+void UnitTestConvolutionComponent() {
+  BaseFloat learning_rate = 0.01,
+      param_stddev = 0.1, bias_stddev = 1.0;
+  int32 patch_stride = 10, patch_step = 1, patch_dim = 4;
+  int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
+  int32 num_splice = 5 + Rand() % 10, num_filters = 5 + Rand() % 10;
+  int32 input_dim = patch_stride * num_splice;
+  int32 filter_dim = patch_dim * num_splice;
+  int32 output_dim = num_patches * num_filters;
+  {
+    ConvolutionComponent component;
+    if (Rand() % 2 == 0) {
+      component.Init(learning_rate, input_dim, output_dim,
+                     patch_dim, patch_step, patch_stride,
+                     param_stddev, bias_stddev);
+    } else {
+      // initialize the hyper-parameters, then re-initialize from a file
+      component.Init(learning_rate, input_dim, output_dim,
+                     patch_dim, patch_step, patch_stride,
+                     param_stddev, bias_stddev);
+      Matrix<BaseFloat> mat(num_filters, filter_dim + 1);
+      mat.SetRandn();
+      mat.Scale(param_stddev);
+      WriteKaldiObject(mat, "tmpf", true);
+      Sleep(0.5);
+      component.Init(learning_rate, "tmpf");
+      unlink("tmpf");
+    }
+    UnitTestGenericComponentInternal(component);
+  }
+  {
+    const char *str = "learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10";
+    ConvolutionComponent component;
+    component.InitFromString(str);
+    UnitTestGenericComponentInternal(component);
+  }
+}
+
 void UnitTestDropoutComponent() {
   // We're testing that the gradients are computed correctly:
   // the input gradients and the model gradients.
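As a quick cross-check of the dimension bookkeeping the test (and the config string in its second block) relies on: with patch_stride = 10, patch_dim = 4 and patch_step = 1 there are 1 + (10 - 4) / 1 = 7 patches per frame, so input-dim is 10 * num_splice and output-dim is 7 * num_filters. A minimal standalone sketch of that arithmetic (plain C++, not part of the patch; num_splice and num_filters are the values implied by the test's input-dim=100, output-dim=70 config):

    #include <cassert>
    int main() {
      const int patch_stride = 10, patch_step = 1, patch_dim = 4;
      const int num_patches = 1 + (patch_stride - patch_dim) / patch_step;  // = 7
      const int num_splice = 10, num_filters = 10;
      assert(num_patches == 7);
      assert(patch_stride * num_splice == 100);  // input-dim
      assert(patch_dim * num_splice == 40);      // filter-dim (weights per filter)
      assert(num_patches * num_filters == 70);   // output-dim
      return 0;
    }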
@@ -826,6 +864,7 @@ int main() {
   UnitTestFixedBiasComponent();
   UnitTestAffineComponentPreconditioned();
   UnitTestAffineComponentPreconditionedOnline();
+  UnitTestConvolutionComponent();
   UnitTestDropoutComponent();
   UnitTestAdditiveNoiseComponent();
   UnitTestParsing();
diff --git a/src/nnet2/nnet-component.cc b/src/nnet2/nnet-component.cc
index b788d40d5dc..c4d486d080e 100644
--- a/src/nnet2/nnet-component.cc
+++ b/src/nnet2/nnet-component.cc
@@ -102,6 +102,8 @@ Component* Component::NewComponentOfType(const std::string &component_type) {
     ans = new DropoutComponent();
   } else if (component_type == "AdditiveNoiseComponent") {
     ans = new AdditiveNoiseComponent();
+  } else if (component_type == "ConvolutionComponent") {
+    ans = new ConvolutionComponent();
   }
   return ans;
 }
@@ -3672,5 +3674,443 @@ void AdditiveNoiseComponent::Propagate(const ChunkInfo &in_info,
   out->AddMat(stddev_, rand);
 }
 
+ConvolutionComponent::ConvolutionComponent():
+    UpdatableComponent(),
+    patch_dim_(0), patch_step_(0), patch_stride_(0), is_gradient_(false) {}
+
+ConvolutionComponent::ConvolutionComponent(const ConvolutionComponent &component):
+    UpdatableComponent(component),
+    patch_dim_(component.patch_dim_),
+    patch_step_(component.patch_step_),
+    patch_stride_(component.patch_stride_),
+    filter_params_(component.filter_params_),
+    bias_params_(component.bias_params_),
+    is_gradient_(component.is_gradient_) {}
+
+ConvolutionComponent::ConvolutionComponent(const CuMatrixBase<BaseFloat> &filter_params,
+                                           const CuVectorBase<BaseFloat> &bias_params,
+                                           BaseFloat learning_rate):
+    UpdatableComponent(learning_rate),
+    filter_params_(filter_params),
+    bias_params_(bias_params) {
+  KALDI_ASSERT(filter_params.NumRows() == bias_params.Dim() &&
+               bias_params.Dim() != 0);
+  is_gradient_ = false;
+}
+
+// acquire input dim
+int32 ConvolutionComponent::InputDim() const {
+  int32 filter_dim = filter_params_.NumCols();
+  int32 num_splice = filter_dim / patch_dim_;
+  return patch_stride_ * num_splice;
+}
+
+// acquire output dim
+int32 ConvolutionComponent::OutputDim() const {
+  int32 num_filters = filter_params_.NumRows();
+  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
+  return num_patches * num_filters;
+}
+
+// initialize the component using hyper-parameters
+void ConvolutionComponent::Init(BaseFloat learning_rate,
+                                int32 input_dim, int32 output_dim,
+                                int32 patch_dim, int32 patch_step, int32 patch_stride,
+                                BaseFloat param_stddev, BaseFloat bias_stddev) {
+  UpdatableComponent::Init(learning_rate);
+  patch_dim_ = patch_dim;
+  patch_step_ = patch_step;
+  patch_stride_ = patch_stride;
+  int32 num_splice = input_dim / patch_stride;
+  int32 filter_dim = num_splice * patch_dim;
+  int32 num_patches = 1 + (patch_stride - patch_dim) / patch_step;
+  int32 num_filters = output_dim / num_patches;
+  KALDI_ASSERT(input_dim % patch_stride == 0);
+  KALDI_ASSERT((patch_stride - patch_dim) % patch_step == 0);
+  KALDI_ASSERT(output_dim % num_patches == 0);
+
+  filter_params_.Resize(num_filters, filter_dim);
+  bias_params_.Resize(num_filters);
+  KALDI_ASSERT(param_stddev >= 0.0 && bias_stddev >= 0.0);
+  filter_params_.SetRandn();
+  filter_params_.Scale(param_stddev);
+  bias_params_.SetRandn();
+  bias_params_.Scale(bias_stddev);
+}
+
+// initialize the component using a predefined parameter matrix read from file
+void ConvolutionComponent::Init(BaseFloat learning_rate,
+                                std::string matrix_filename) {
+  UpdatableComponent::Init(learning_rate);
+  CuMatrix<BaseFloat> mat;
+  ReadKaldiObject(matrix_filename, &mat);
+  KALDI_ASSERT(mat.NumCols() >= 2);
+  int32 filter_dim = mat.NumCols() - 1, num_filters = mat.NumRows();
+  filter_params_.Resize(num_filters, filter_dim);
+  bias_params_.Resize(num_filters);
+  filter_params_.CopyFromMat(mat.Range(0, num_filters, 0, filter_dim));
+  bias_params_.CopyColFromMat(mat, filter_dim);
+}
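The matrix file read by the second Init() packs the filters and the bias into a single matrix: one row per filter, with the bias in the extra last column, which is exactly the layout the unit test writes to "tmpf". A sketch of producing such a file outside the test follows (the filename, sizes and scale are illustrative assumptions, not part of the patch):

    #include "matrix/kaldi-matrix.h"
    #include "util/kaldi-io.h"

    int main() {
      using namespace kaldi;
      int32 num_filters = 64, filter_dim = 20;
      Matrix<BaseFloat> mat(num_filters, filter_dim + 1);  // [ filters | bias ]
      mat.SetRandn();
      mat.Scale(0.1);
      WriteKaldiObject(mat, "conv_init.mat", true);  // binary
      // later: ConvolutionComponent c; c.Init(0.01, "conv_init.mat");
      return 0;
    }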
+
+// resize the component, setting the parameters to zero, while
+// leaving any other configuration values the same
+void ConvolutionComponent::Resize(int32 input_dim, int32 output_dim) {
+  KALDI_ASSERT(input_dim > 0 && output_dim > 0);
+  int32 num_splice = input_dim / patch_stride_;
+  int32 filter_dim = num_splice * patch_dim_;
+  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
+  int32 num_filters = output_dim / num_patches;
+  KALDI_ASSERT(input_dim % patch_stride_ == 0);
+  KALDI_ASSERT((patch_stride_ - patch_dim_) % patch_step_ == 0);
+  KALDI_ASSERT(output_dim % num_patches == 0);
+  filter_params_.Resize(num_filters, filter_dim);
+  bias_params_.Resize(num_filters);
+}
+
+// display information about the component
+std::string ConvolutionComponent::Info() const {
+  std::stringstream stream;
+  BaseFloat filter_params_size = static_cast<BaseFloat>(filter_params_.NumRows())
+      * static_cast<BaseFloat>(filter_params_.NumCols());
+  BaseFloat filter_stddev =
+      std::sqrt(TraceMatMat(filter_params_, filter_params_, kTrans) /
+                filter_params_size),
+      bias_stddev = std::sqrt(VecVec(bias_params_, bias_params_) /
+                              bias_params_.Dim());
+
+  int32 num_splice = InputDim() / patch_stride_;
+  int32 filter_dim = num_splice * patch_dim_;
+  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
+  int32 num_filters = OutputDim() / num_patches;
+
+  stream << Type() << ", input-dim=" << InputDim()
+         << ", output-dim=" << OutputDim()
+         << ", num-splice=" << num_splice
+         << ", num-patches=" << num_patches
+         << ", num-filters=" << num_filters
+         << ", filter-dim=" << filter_dim
+         << ", filter-params-stddev=" << filter_stddev
+         << ", bias-params-stddev=" << bias_stddev
+         << ", learning-rate=" << LearningRate();
+  return stream.str();
+}
+
+// initialize the component from a configuration-file line
+void ConvolutionComponent::InitFromString(std::string args) {
+  std::string orig_args(args);
+  bool ok = true;
+  BaseFloat learning_rate = learning_rate_;
+  std::string matrix_filename;
+  int32 input_dim = -1, output_dim = -1;
+  int32 patch_dim = -1, patch_step = -1, patch_stride = -1;
+  ParseFromString("learning-rate", &args, &learning_rate);
+  if (ParseFromString("matrix", &args, &matrix_filename)) {
+    // initialize from a predefined parameter matrix
+    Init(learning_rate, matrix_filename);
+    if (ParseFromString("input-dim", &args, &input_dim))
+      KALDI_ASSERT(input_dim == InputDim() &&
+                   "input-dim mismatch vs. matrix.");
+    if (ParseFromString("output-dim", &args, &output_dim))
+      KALDI_ASSERT(output_dim == OutputDim() &&
+                   "output-dim mismatch vs. matrix.");
+  } else {
+    // initialize from the configuration values
+    ok = ok && ParseFromString("input-dim", &args, &input_dim);
+    ok = ok && ParseFromString("output-dim", &args, &output_dim);
+    ok = ok && ParseFromString("patch-dim", &args, &patch_dim);
+    ok = ok && ParseFromString("patch-step", &args, &patch_step);
+    ok = ok && ParseFromString("patch-stride", &args, &patch_stride);
+    BaseFloat param_stddev = 1.0 / std::sqrt(input_dim), bias_stddev = 1.0;
+    ParseFromString("param-stddev", &args, &param_stddev);
+    ParseFromString("bias-stddev", &args, &bias_stddev);
+    Init(learning_rate, input_dim, output_dim,
+         patch_dim, patch_step, patch_stride, param_stddev, bias_stddev);
+  }
+  if (!args.empty())
+    KALDI_ERR << "Could not process these elements in initializer: " << args;
+  if (!ok)
+    KALDI_ERR << "Bad initializer " << orig_args;
+}
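Two example configuration lines that this InitFromString() accepts, matching the unit test's parameters (the matrix path in the second line is only a placeholder for a file built as sketched earlier):

    ConvolutionComponent learning-rate=0.01 input-dim=100 output-dim=70 param-stddev=0.1 patch-dim=4 patch-step=1 patch-stride=10
    ConvolutionComponent learning-rate=0.01 matrix=conv_init.mat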
+
+// propagation function
+void ConvolutionComponent::Propagate(const ChunkInfo &in_info,
+                                     const ChunkInfo &out_info,
+                                     const CuMatrixBase<BaseFloat> &in,
+                                     CuMatrixBase<BaseFloat> *out) const {
+  in_info.CheckSize(in);
+  out_info.CheckSize(*out);
+  KALDI_ASSERT(in_info.NumChunks() == out_info.NumChunks());
+
+  // dims
+  int32 num_splice = InputDim() / patch_stride_;
+  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
+  int32 num_filters = filter_params_.NumRows();
+  int32 num_frames = in.NumRows();
+  int32 filter_dim = filter_params_.NumCols();
+
+  /** Buffer of reshaped inputs:
+   *  1 row = vectorized rectangular feature patch,
+   *  1 col = dim over speech frames,
+   *  std::vector dim = patch position
+   */
+  std::vector<CuMatrix<BaseFloat> > vectorized_feature_patches_;
+
+  // prepare the buffers
+  if (vectorized_feature_patches_.size() == 0) {
+    vectorized_feature_patches_.resize(num_patches);
+  }
+
+  // vectorize the inputs
+  for (int32 p = 0; p < num_patches; p++) {
+    vectorized_feature_patches_[p].Resize(num_frames, filter_dim, kSetZero);
+    // build up a column-selection mask:
+    std::vector<int32> column_mask;
+    for (int32 s = 0; s < num_splice; s++) {
+      for (int32 d = 0; d < patch_dim_; d++) {
+        column_mask.push_back(p * patch_step_ + s * patch_stride_ + d);
+      }
+    }
+    KALDI_ASSERT(column_mask.size() == filter_dim);
+    // select the columns
+    vectorized_feature_patches_[p].CopyCols(in, column_mask);
+  }
+
+  // compute filter activations
+  for (int32 p = 0; p < num_patches; p++) {
+    CuSubMatrix<BaseFloat> tgt(out->ColRange(p * num_filters, num_filters));
+    tgt.AddVecToRows(1.0, bias_params_, 0.0);  // add bias
+    // apply all filters
+    tgt.AddMatMat(1.0, vectorized_feature_patches_[p], kNoTrans,
+                  filter_params_, kTrans, 1.0);
+  }
+}
+
+// scale the parameters
+void ConvolutionComponent::Scale(BaseFloat scale) {
+  filter_params_.Scale(scale);
+  bias_params_.Scale(scale);
+}
+
+// add another convolution component
+void ConvolutionComponent::Add(BaseFloat alpha, const UpdatableComponent &other_in) {
+  const ConvolutionComponent *other =
+      dynamic_cast<const ConvolutionComponent*>(&other_in);
+  KALDI_ASSERT(other != NULL);
+  filter_params_.AddMat(alpha, other->filter_params_);
+  bias_params_.AddVec(alpha, other->bias_params_);
+}
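To make the column mask concrete: with patch_dim_ = 4, patch_step_ = 1, patch_stride_ = 10 and num_splice = 2 (a 20-dimensional spliced input), patch p = 0 selects input columns {0,1,2,3, 10,11,12,13} and patch p = 1 selects {1,2,3,4, 11,12,13,14}; that is, the same patch_dim-wide window is taken from every spliced copy of the frame and shifted by patch_step between patch positions. Each per-patch matrix is then multiplied by the transposed filter matrix, so every filter is applied at every patch position, and the output is laid out as num_patches blocks of num_filters columns.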
+
+// back-propagation function
+void ConvolutionComponent::Backprop(const ChunkInfo &in_info,
+                                    const ChunkInfo &out_info,
+                                    const CuMatrixBase<BaseFloat> &in_value,
+                                    const CuMatrixBase<BaseFloat> &out_value,
+                                    const CuMatrixBase<BaseFloat> &out_deriv,
+                                    Component *to_update_in,
+                                    CuMatrix<BaseFloat> *in_deriv) const {
+  in_deriv->Resize(in_value.NumRows(), in_value.NumCols(), kSetZero);
+  ConvolutionComponent *to_update = dynamic_cast<ConvolutionComponent*>(to_update_in);
+  int32 num_splice = InputDim() / patch_stride_;
+  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
+  int32 num_filters = filter_params_.NumRows();
+  int32 num_frames = in_value.NumRows();
+  int32 filter_dim = filter_params_.NumCols();
+
+  /** Buffer for backpropagation:
+   *  derivatives in the domain of 'vectorized_feature_patches_',
+   *  1 row = vectorized rectangular feature patch,
+   *  1 col = dim over speech frames,
+   *  std::vector dim = patch position
+   */
+  std::vector<CuMatrix<BaseFloat> > feature_patch_diffs_;
+  feature_patch_diffs_.resize(num_patches);
+
+  // backpropagate to the vector of matrices
+  // (corresponding to the positions of a filter)
+  for (int32 p = 0; p < num_patches; p++) {
+    feature_patch_diffs_[p].Resize(num_frames, filter_dim, kSetZero);  // reset
+    CuSubMatrix<BaseFloat> out_deriv_patch(out_deriv.ColRange(p * num_filters,
+                                                              num_filters));
+    feature_patch_diffs_[p].AddMatMat(1.0, out_deriv_patch, kNoTrans,
+                                      filter_params_, kNoTrans, 0.0);
+  }
+
+  // sum the derivatives into in_deriv; columns that appear in several
+  // patches accumulate the contributions of all of them
+  for (int32 p = 0; p < num_patches; p++) {
+    for (int32 s = 0; s < num_splice; s++) {
+      CuSubMatrix<BaseFloat> src(feature_patch_diffs_[p].ColRange(s * patch_dim_,
+                                                                  patch_dim_));
+      CuSubMatrix<BaseFloat> tgt(in_deriv->ColRange(p * patch_step_ + s * patch_stride_,
+                                                    patch_dim_));
+      tgt.AddMat(1.0, src);  // sum
+    }
+  }
+
+  if (to_update != NULL) {
+    // Next update the model (must do this 2nd so the derivatives we propagate
+    // are accurate, in case this == to_update_in.)
+    to_update->Update(in_value, out_deriv);
+  }
+}
+
+void ConvolutionComponent::SetZero(bool treat_as_gradient) {
+  if (treat_as_gradient) {
+    SetLearningRate(1.0);
+  }
+  filter_params_.SetZero();
+  bias_params_.SetZero();
+  if (treat_as_gradient) {
+    is_gradient_ = true;
+  }
+}
+
+void ConvolutionComponent::Read(std::istream &is, bool binary) {
+  std::ostringstream ostr_beg, ostr_end;
+  ostr_beg << "<" << Type() << ">";   // e.g. "<ConvolutionComponent>"
+  ostr_end << "</" << Type() << ">";  // e.g. "</ConvolutionComponent>"
+  // might not see the "<ConvolutionComponent>" part because
+  // of how ReadNew() works.
+  ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<LearningRate>");
+  ReadBasicType(is, binary, &learning_rate_);
+  ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<PatchDim>");
+  ReadBasicType(is, binary, &patch_dim_);
+  ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<PatchStep>");
+  ReadBasicType(is, binary, &patch_step_);
+  ExpectOneOrTwoTokens(is, binary, ostr_beg.str(), "<PatchStride>");
+  ReadBasicType(is, binary, &patch_stride_);
+  ExpectToken(is, binary, "<FilterParams>");
+  filter_params_.Read(is, binary);
+  ExpectToken(is, binary, "<BiasParams>");
+  bias_params_.Read(is, binary);
+  std::string tok;
+  ReadToken(is, binary, &tok);
+  if (tok == "<IsGradient>") {
+    ReadBasicType(is, binary, &is_gradient_);
+    ExpectToken(is, binary, ostr_end.str());
+  } else {
+    is_gradient_ = false;
+    KALDI_ASSERT(tok == ostr_end.str());
+  }
+}
+
+void ConvolutionComponent::Write(std::ostream &os, bool binary) const {
+  std::ostringstream ostr_beg, ostr_end;
+  ostr_beg << "<" << Type() << ">";   // e.g. "<ConvolutionComponent>"
+  ostr_end << "</" << Type() << ">";  // e.g. "</ConvolutionComponent>"
+  WriteToken(os, binary, ostr_beg.str());
+  WriteToken(os, binary, "<LearningRate>");
+  WriteBasicType(os, binary, learning_rate_);
+  WriteToken(os, binary, "<PatchDim>");
+  WriteBasicType(os, binary, patch_dim_);
+  WriteToken(os, binary, "<PatchStep>");
+  WriteBasicType(os, binary, patch_step_);
+  WriteToken(os, binary, "<PatchStride>");
+  WriteBasicType(os, binary, patch_stride_);
+  WriteToken(os, binary, "<FilterParams>");
+  filter_params_.Write(os, binary);
+  WriteToken(os, binary, "<BiasParams>");
+  bias_params_.Write(os, binary);
+  WriteToken(os, binary, "<IsGradient>");
+  WriteBasicType(os, binary, is_gradient_);
+  WriteToken(os, binary, ostr_end.str());
+}
+
+BaseFloat ConvolutionComponent::DotProduct(const UpdatableComponent &other_in) const {
+  const ConvolutionComponent *other =
+      dynamic_cast<const ConvolutionComponent*>(&other_in);
+  return TraceMatMat(filter_params_, other->filter_params_, kTrans)
+         + VecVec(bias_params_, other->bias_params_);
+}
+
+Component* ConvolutionComponent::Copy() const {
+  ConvolutionComponent *ans = new ConvolutionComponent();
+  ans->learning_rate_ = learning_rate_;
+  ans->patch_dim_ = patch_dim_;
+  ans->patch_step_ = patch_step_;
+  ans->patch_stride_ = patch_stride_;
+  ans->filter_params_ = filter_params_;
+  ans->bias_params_ = bias_params_;
+  ans->is_gradient_ = is_gradient_;
+  return ans;
+}
+
+void ConvolutionComponent::PerturbParams(BaseFloat stddev) {
+  CuMatrix<BaseFloat> temp_filter_params(filter_params_);
+  temp_filter_params.SetRandn();
+  filter_params_.AddMat(stddev, temp_filter_params);
+
+  CuVector<BaseFloat> temp_bias_params(bias_params_);
+  temp_bias_params.SetRandn();
+  bias_params_.AddVec(stddev, temp_bias_params);
+}
+
+void ConvolutionComponent::SetParams(const VectorBase<BaseFloat> &bias,
+                                     const MatrixBase<BaseFloat> &filter) {
+  bias_params_ = bias;
+  filter_params_ = filter;
+  KALDI_ASSERT(bias_params_.Dim() == filter_params_.NumRows());
+}
+
+int32 ConvolutionComponent::GetParameterDim() const {
+  return (filter_params_.NumCols() + 1) * filter_params_.NumRows();
+}
+
+// update parameters
+void ConvolutionComponent::Update(const CuMatrixBase<BaseFloat> &in_value,
+                                  const CuMatrixBase<BaseFloat> &out_deriv) {
+  // useful dims
+  int32 num_patches = 1 + (patch_stride_ - patch_dim_) / patch_step_;
+  int32 num_filters = filter_params_.NumRows();
+  int32 filter_dim = filter_params_.NumCols();
+  int32 num_frames = in_value.NumRows();
+  int32 num_splice = InputDim() / patch_stride_;
+  CuMatrix<BaseFloat> filters_grad;
+  CuVector<BaseFloat> bias_grad;
+
+  /** Buffer of reshaped inputs:
+   *  1 row = vectorized rectangular feature patch,
+   *  1 col = dim over speech frames,
+   *  std::vector dim = patch position
+   */
+  std::vector<CuMatrix<BaseFloat> > vectorized_feature_patches_;
+
+  // prepare the buffers
+  if (vectorized_feature_patches_.size() == 0) {
+    vectorized_feature_patches_.resize(num_patches);
+  }
+
+  // vectorize the inputs
+  for (int32 p = 0; p < num_patches; p++) {
+    vectorized_feature_patches_[p].Resize(num_frames, filter_dim, kSetZero);
+    // build up a column-selection mask:
+    std::vector<int32> column_mask;
+    for (int32 s = 0; s < num_splice; s++) {
+      for (int32 d = 0; d < patch_dim_; d++) {
+        column_mask.push_back(p * patch_step_ + s * patch_stride_ + d);
+      }
+    }
+    KALDI_ASSERT(column_mask.size() == filter_dim);
+    // select the columns
+    vectorized_feature_patches_[p].CopyCols(in_value, column_mask);
+  }
+
+  //
+  // calculate the gradient
+  //
+  filters_grad.Resize(num_filters, filter_dim, kSetZero);  // reset
+  bias_grad.Resize(num_filters, kSetZero);  // reset
+  // use all the patches
+  for (int32 p = 0; p < num_patches; p++) {  // sum
+    CuSubMatrix<BaseFloat> diff_patch(out_deriv.ColRange(p * num_filters,
+                                                         num_filters));
+    filters_grad.AddMatMat(1.0, diff_patch, kTrans, vectorized_feature_patches_[p],
+                           kNoTrans, 1.0);
+    bias_grad.AddRowSumMat(1.0, diff_patch, 1.0);
+  }
+
+  //
+  // update
+  //
+  filter_params_.AddMat(learning_rate_, filters_grad);
+  bias_params_.AddVec(learning_rate_, bias_grad);
+}
+
 } // namespace nnet2
 } // namespace kaldi
diff --git a/src/nnet2/nnet-component.h b/src/nnet2/nnet-component.h
index 44a19d28b2d..4c17d7a8bfd 100644
--- a/src/nnet2/nnet-component.h
+++ b/src/nnet2/nnet-component.h
@@ -1613,6 +1613,70 @@ class AdditiveNoiseComponent: public RandomComponent {
   BaseFloat stddev_;
 };
 
+class ConvolutionComponent: public UpdatableComponent {
+ public:
+  ConvolutionComponent();
+  // constructor using another component
+  ConvolutionComponent(const ConvolutionComponent &component);
+  // constructor using parameters
+  ConvolutionComponent(const CuMatrixBase<BaseFloat> &filter_params,
+                       const CuVectorBase<BaseFloat> &bias_params,
+                       BaseFloat learning_rate);
+
+  int32 InputDim() const;
+  int32 OutputDim() const;
+  void Init(BaseFloat learning_rate, int32 input_dim, int32 output_dim,
+            int32 patch_dim, int32 patch_step, int32 patch_stride,
+            BaseFloat param_stddev, BaseFloat bias_stddev);
+  void Init(BaseFloat learning_rate, std::string matrix_filename);
+
+  // resize the component, setting the parameters to zero, while
+  // leaving any other configuration values the same
+  void Resize(int32 input_dim, int32 output_dim);
+  std::string Info() const;
+  void InitFromString(std::string args);
+  std::string Type() const { return "ConvolutionComponent"; }
+  bool BackpropNeedsInput() const { return true; }
+  bool BackpropNeedsOutput() const { return false; }
+  using Component::Propagate;  // to avoid name hiding
+  void Propagate(const ChunkInfo &in_info,
+                 const ChunkInfo &out_info,
+                 const CuMatrixBase<BaseFloat> &in,
+                 CuMatrixBase<BaseFloat> *out) const;
+  void Scale(BaseFloat scale);
+  virtual void Add(BaseFloat alpha, const UpdatableComponent &other);
+  virtual void Backprop(const ChunkInfo &in_info,
+                        const ChunkInfo &out_info,
+                        const CuMatrixBase<BaseFloat> &in_value,
+                        const CuMatrixBase<BaseFloat> &out_value,
+                        const CuMatrixBase<BaseFloat> &out_deriv,
+                        Component *to_update_in,
+                        CuMatrix<BaseFloat> *in_deriv) const;
+  void SetZero(bool treat_as_gradient);
+  void Read(std::istream &is, bool binary);
+  void Write(std::ostream &os, bool binary) const;
+  virtual BaseFloat DotProduct(const UpdatableComponent &other) const;
+  Component* Copy() const;
+  void PerturbParams(BaseFloat stddev);
+  void SetParams(const VectorBase<BaseFloat> &bias,
+                 const MatrixBase<BaseFloat> &filter);
+  const CuVector<BaseFloat> &BiasParams() { return bias_params_; }
+  const CuMatrix<BaseFloat> &LinearParams() { return filter_params_; }
+  int32 GetParameterDim() const;
+  void Update(const CuMatrixBase<BaseFloat> &in_value,
+              const CuMatrixBase<BaseFloat> &out_deriv);
+
+ private:
+  int32 patch_dim_;
+  int32 patch_step_;
+  int32 patch_stride_;
+
+  const ConvolutionComponent &operator = (const ConvolutionComponent &other);  // Disallow.
+  CuMatrix<BaseFloat> filter_params_;
+  CuVector<BaseFloat> bias_params_;
+  bool is_gradient_;
+};
+
 /// Functions used in Init routines.  Suppose name=="foo", if "string" has a
 /// field like foo=12, this function will set "param" to 12 and remove that
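For reference, the Write() method added above serializes the component with the nested-token layout sketched below (text mode; the numeric values are placeholders, and the filter and bias parameters are written in Kaldi's usual bracketed matrix/vector form). Read() accepts the same stream and treats <IsGradient> as optional, falling back to false when the end tag follows the bias directly:

    <ConvolutionComponent> <LearningRate> 0.01 <PatchDim> 4 <PatchStep> 1
    <PatchStride> 10 <FilterParams>  [ ... ] <BiasParams>  [ ... ]
    <IsGradient> F </ConvolutionComponent>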

From 885586f929e274ffd5cb35b4cf781d673f101bdf Mon Sep 17 00:00:00 2001
From: naxingyu
Date: Mon, 3 Aug 2015 17:21:58 +0800
Subject: [PATCH 2/7] add Maxpooling component and example script

---
 egs/hkust/s5/local/nnet2/run_convnet.sh       |  53 ++
 egs/wsj/s5/steps/nnet2/decode.sh              |   7 +-
 .../s5/steps/nnet2/train_convnet_accel2.sh    | 662 ++++++++++++++++++
 src/nnet2/nnet-component-test.cc              |  26 +
 src/nnet2/nnet-component.cc                   | 139 +++-
 src/nnet2/nnet-component.h                    |  85 ++-
 6 files changed, 968 insertions(+), 4 deletions(-)
 create mode 100755 egs/hkust/s5/local/nnet2/run_convnet.sh
 create mode 100755 egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh

diff --git a/egs/hkust/s5/local/nnet2/run_convnet.sh b/egs/hkust/s5/local/nnet2/run_convnet.sh
new file mode 100755
index 00000000000..ea1d88167e5
--- /dev/null
+++ b/egs/hkust/s5/local/nnet2/run_convnet.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# 2015 Xingyu Na
+# This runs on the full training set, using the ConvNet setup with
+# sigmoid affine layers, on top of fbank features, on GPU.
+
+temp_dir=
+dir=exp/nnet2_convnet
+stage=-5
+train_original=data/train
+train=data-fb/train
+
+. ./cmd.sh
+. ./path.sh
+
+. utils/parse_options.sh
+
+parallel_opts="--gpu 1"  # This is suitable for the CLSP network; you'll
+                         # likely have to change it.
+
+# Make the FBANK features
+if [ $stage -le -5 ]; then
+  # Dev set
+  utils/copy_data_dir.sh data/dev data-fb/dev || exit 1; rm data-fb/dev/{cmvn,feats}.scp
+  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
+    data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
+  steps/compute_cmvn_stats.sh data-fb/dev data-fb/dev/log data-fb/dev/data || exit 1;
+  # Training set
+  utils/copy_data_dir.sh $train_original $train || exit 1; rm $train/{cmvn,feats}.scp
+  steps/make_fbank.sh --nj 10 --cmd "$train_cmd" \
+    $train $train/log $train/data || exit 1;
+  steps/compute_cmvn_stats.sh $train $train/log $train/data || exit 1;
+fi
+
+(
+  if [ ! -f $dir/final.mdl ]; then
+    steps/nnet2/train_convnet_accel2.sh --parallel-opts "$parallel_opts" \
+      --cmd "$decode_cmd" --stage $stage \
+      --num-threads 1 --minibatch-size 512 \
+      --mix-up 20000 --samples-per-iter 300000 \
+      --num-epochs 15 --delta-order 2 \
+      --initial-effective-lrate 0.0005 --final-effective-lrate 0.000025 \
+      --num-jobs-initial 3 --num-jobs-final 8 --num-hidden-layers 4 --splice-width 5 \
+      --hidden-dim 2000 --num-filters1 128 --patch-dim1 7 --pool-size 3 \
+      --num-filters2 256 --patch-dim2 4 \
+      $train data/lang exp/tri5a_ali $dir || exit 1;
+  fi
+
+  steps/nnet2/decode.sh --cmd "$decode_cmd" --nj 10 \
+    --config conf/decode.config \
+    exp/tri5a/graph data-fb/dev \
+    $dir/decode || exit 1;
+)
diff --git a/egs/wsj/s5/steps/nnet2/decode.sh b/egs/wsj/s5/steps/nnet2/decode.sh
index df8600df32b..753411f4563 100755
--- a/egs/wsj/s5/steps/nnet2/decode.sh
+++ b/egs/wsj/s5/steps/nnet2/decode.sh
@@ -84,7 +84,12 @@ fi
 splice_opts=`cat $srcdir/splice_opts 2>/dev/null`
 
 case $feat_type in
-  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |";;
+  raw) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- |"
+    if [ -f $srcdir/delta_order ]; then
+      delta_order=`cat $srcdir/delta_order 2>/dev/null`
+      feats="$feats add-deltas --delta-order=$delta_order ark:- ark:- |"
+    fi
+    ;;
   lda) feats="ark,s,cs:apply-cmvn $cmvn_opts --utt2spk=ark:$sdata/JOB/utt2spk scp:$sdata/JOB/cmvn.scp scp:$sdata/JOB/feats.scp ark:- | splice-feats $splice_opts ark:- ark:- | transform-feats $srcdir/final.mat ark:- ark:- |" ;;
   *) echo "$0: invalid feature type $feat_type" && exit 1;
diff --git a/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh
new file mode 100755
index 00000000000..1b23ec47cef
--- /dev/null
+++ b/egs/wsj/s5/steps/nnet2/train_convnet_accel2.sh
@@ -0,0 +1,662 @@
+#!/bin/bash
+
+# Copyright 2012-2014  Johns Hopkins University (Author: Daniel Povey).
+#           2013  Xiaohui Zhang
+#           2013  Guoguo Chen
+#           2014  Vimal Manohar
+#           2015  Xingyu Na
+# Apache 2.0.
+
+# train_convnet_accel2.sh is modified from train_pnorm_accel2.sh
+
+# train_pnorm_accel2.sh is a modified form of train_pnorm_simple2.sh (the "2"
+# suffix is because they both use the "new" egs format, created by
+# get_egs2.sh).  The "accel" part of the name refers to the fact that this
+# script uses a number of jobs that can increase during training.  You can
+# specify --initial-num-jobs and --final-num-jobs to control these separately.
+# Also, in this script, the learning rates specified by --initial-learning-rate
+# and --final-learning-rate are the "effective learning rates" (defined as the
+# learning rate divided by the number of jobs), and the actual learning rates
+# used will be the specified learning rates multiplied by the current number
+# of jobs.  You'll want to set these lower than you normally would previously
+# have set the learning rates, by a factor equal to the (previous) number of
+# jobs.
+
+
+# Begin configuration section.
+cmd=run.pl
+num_epochs=15      # Number of epochs of training;
+                   # the number of iterations is worked out from this.
+initial_effective_lrate=0.01
+final_effective_lrate=0.001
+bias_stddev=0.5
+hidden_dim=3000
+minibatch_size=128 # by default use a smallish minibatch size for neural net
+                   # training; this controls instability which would otherwise
+                   # be a problem with multi-threaded update.
+ +samples_per_iter=400000 # each iteration of training, see this many samples + # per job. This option is passed to get_egs.sh +num_jobs_initial=1 # Number of neural net jobs to run in parallel at the start of training. +num_jobs_final=8 # Number of jobs to run in parallel at the end of training. + +prior_subset_size=10000 # 10k samples per job, for computing priors. Should be + # more than enough. +num_jobs_compute_prior=10 # these are single-threaded, run on CPU. +get_egs_stage=0 +online_ivector_dir= + + +max_models_combine=20 # The "max_models_combine" is the maximum number of models we give + # to the final 'combine' stage, but these models will themselves be averages of + # iteration-number ranges. + +shuffle_buffer_size=5000 # This "buffer_size" variable controls randomization of the samples + # on each iter. You could set it to 0 or to a large value for complete + # randomization, but this would both consume memory and cause spikes in + # disk I/O. Smaller is easier on disk and memory but less random. It's + # not a huge deal though, as samples are anyway randomized right at the start. + # (the point of this is to get data in different minibatches on different iterations, + # since in the preconditioning method, 2 samples in the same minibatch can + # affect each others' gradients. + +add_layers_period=2 # by default, add new layers every 2 iterations. +num_hidden_layers=3 +stage=-4 + +splice_width=4 # meaning +- 4 frames on each side for second LDA +left_context= # if set, overrides splice-width +right_context= # if set, overrides splice-width. +randprune=4.0 # speeds up LDA. +alpha=4.0 # relates to preconditioning. +update_period=4 # relates to online preconditioning: says how often we update the subspace. +num_samples_history=2000 # relates to online preconditioning +max_change_per_sample=0.075 +precondition_rank_in=20 # relates to online preconditioning +precondition_rank_out=80 # relates to online preconditioning + +num_filters1=128 # number of filters in the first convolutional layer +patch_step1=1 # patch step of the first convolutional layer +patch_dim1=7 # dim of convolutional kernel in the first layer +pool_size=3 # size of pooling after the first convolutional layer +num_filters2=256 # number of filters in the second convolutional layer +patch_dim2=4 # dim of convolutional kernel in the second layer + +mix_up=0 # Number of components to mix up to (should be > #tree leaves, if + # specified.) +num_threads=16 +parallel_opts="-pe smp 16 -l ram_free=1G,mem_free=1G" + # by default we use 16 threads; this lets the queue know. + # note: parallel_opts doesn't automatically get adjusted if you adjust num-threads. +combine_num_threads=8 +combine_parallel_opts="-pe smp 8" # queue options for the "combine" stage. +cleanup=true +egs_dir= +lda_opts= +lda_dim= +egs_opts= +delta_order= +io_opts="-tc 5" # for jobs with a lot of I/O, limits the number running at one time. +transform_dir= # If supplied, overrides alidir +postdir= +cmvn_opts= # will be passed to get_lda.sh and get_egs.sh, if supplied. + # only relevant for "raw" features, not lda. +feat_type= # Can be used to force "raw" features. +align_cmd= # The cmd that is passed to steps/nnet2/align.sh +align_use_gpu= # Passed to use_gpu in steps/nnet2/align.sh [yes/no] +realign_times= # List of times on which we realign. Each time is + # floating point number strictly between 0 and 1, which + # will be multiplied by the num-iters to get an iteration + # number. 
+num_jobs_align=30 # Number of jobs for realignment +srand=0 # random seed used to initialize the nnet +# End configuration section. + + +echo "$0 $@" # Print the command line for logging + +if [ -f path.sh ]; then . ./path.sh; fi +. parse_options.sh || exit 1; + +if [ $# != 4 ]; then + echo "Usage: $0 [opts] " + echo " e.g.: $0 data/train data/lang exp/tri3_ali exp/tri4_nnet" + echo "" + echo "Main options (for others, see top of script file)" + echo " --config # config file containing options" + echo " --cmd (utils/run.pl|utils/queue.pl ) # how to run jobs." + echo " --num-epochs <#epochs|15> # Number of epochs of training" + echo " --initial-effective-lrate # effective learning rate at start of training," + echo " # actual learning-rate is this time num-jobs." + echo " --final-effective-lrate # effective learning rate at end of training." + echo " --num-hidden-layers <#hidden-layers|2> # Number of hidden layers, e.g. 2 for 3 hours of data, 4 for 100hrs" + echo " --add-layers-period <#iters|2> # Number of iterations between adding hidden layers" + echo " --mix-up <#pseudo-gaussians|0> # Can be used to have multiple targets in final output layer," + echo " # per context-dependent state. Try a number several times #states." + echo " --num-jobs-initial # Number of parallel jobs to use for neural net training, at the start." + echo " --num-jobs-final # Number of parallel jobs to use for neural net training, at the end" + echo " --num-threads # Number of parallel threads per job (will affect results" + echo " # as well as speed; may interact with batch size; if you increase" + echo " # this, you may want to decrease the batch size." + echo " --parallel-opts # extra options to pass to e.g. queue.pl for processes that" + echo " # use multiple threads... note, you might have to reduce mem_free,ram_free" + echo " # versus your defaults, because it gets multiplied by the -pe smp argument." + echo " --io-opts # Options given to e.g. queue.pl for jobs that do a lot of I/O." + echo " --minibatch-size # Size of minibatch to process (note: product with --num-threads" + echo " # should not get too large, e.g. >2k)." + echo " --samples-per-iter <#samples|400000> # Number of samples of data to process per iteration, per" + echo " # process." + echo " --splice-width # Number of frames on each side to append for feature input" + echo " # (note: we splice processed, typically 40-dimensional frames" + echo " --lda-dim # Dimension to reduce spliced features to with LDA" + echo " --realign-epochs # A list of space-separated epoch indices the beginning of which" + echo " # realignment is to be done" + echo " --align-cmd (utils/run.pl|utils/queue.pl ) # passed to align.sh" + echo " --align-use-gpu (yes/no) # specify is gpu is to be used for realignment" + echo " --num-jobs-align <#njobs|30> # Number of jobs to perform realignment" + echo " --stage # Used to run a partially-completed training process from somewhere in" + echo " # the middle." + + + exit 1; +fi + +data=$1 +lang=$2 +alidir=$3 +dir=$4 + +if [ ! -z "$realign_times" ]; then + [ -z "$align_cmd" ] && echo "$0: realign_times specified but align_cmd not specified" && exit 1 + [ -z "$align_use_gpu" ] && echo "$0: realign_times specified but align_use_gpu not specified" && exit 1 +fi + +# Check some files. +for f in $data/feats.scp $lang/L.fst $alidir/final.mdl $alidir/tree; do + [ ! -f $f ] && echo "$0: no such file $f" && exit 1; +done + +[ ! -f $postdir/post.1.scp ] && [ ! 
-f $alidir/ali.1.gz ] && echo "$0: no (soft) alignments provided" && exit 1; + +trap 'for pid in $(jobs -pr); do kill -KILL $pid; done' INT QUIT TERM + +# Set some variables. +num_leaves=`tree-info $alidir/tree 2>/dev/null | grep num-pdfs | awk '{print $2}'` || exit 1 +[ -z $num_leaves ] && echo "\$num_leaves is unset" && exit 1 +[ "$num_leaves" -eq "0" ] && echo "\$num_leaves is 0" && exit 1 + +nj=`cat $alidir/num_jobs` || exit 1; # number of jobs in alignment dir... +# in this dir we'll have just one job. +sdata=$data/split$nj +utils/split_data.sh $data $nj + +mkdir -p $dir/log +echo $nj > $dir/num_jobs +cp $alidir/tree $dir + +extra_opts=() +[ ! -z "$cmvn_opts" ] && extra_opts+=(--cmvn-opts "$cmvn_opts") +[ ! -z "$feat_type" ] && extra_opts+=(--feat-type $feat_type) +[ ! -z "$delta_order" ] && extra_opts+=(--delta-order $delta_order) +[ ! -z "$online_ivector_dir" ] && extra_opts+=(--online-ivector-dir $online_ivector_dir) +[ -z "$transform_dir" ] && transform_dir=$alidir +extra_opts+=(--transform-dir $transform_dir) +[ -z "$left_context" ] && left_context=$splice_width +[ -z "$right_context" ] && right_context=$splice_width +extra_opts+=(--left-context $left_context --right-context $right_context) + +feat-to-dim scp:$sdata/1/feats.scp - > $dir/feat_dim +feat_dim=$(cat $dir/feat_dim) || exit 1; + +if [ $stage -le -3 ] && [ -z "$egs_dir" ]; then + echo "$0: calling get_egs2.sh" + steps/nnet2/get_egs2.sh $egs_opts "${extra_opts[@]}" --io-opts "$io_opts" \ + --postdir "$postdir" --samples-per-iter $samples_per_iter --stage $get_egs_stage \ + --cmd "$cmd" --feat-type "raw" $data $alidir $dir/egs || exit 1; +fi + +if [ -f $dir/egs/cmvn_opts ]; then + cp $dir/egs/cmvn_opts $dir +fi + +if [ -f $dir/egs/delta_order ]; then + cp $dir/egs/delta_order $dir +fi + +if [ -z $egs_dir ]; then + egs_dir=$dir/egs +fi + +frames_per_eg=$(cat $egs_dir/info/frames_per_eg) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } +num_archives=$(cat $egs_dir/info/num_archives) || { echo "error: no such file $egs_dir/info/frames_per_eg"; exit 1; } + +# num_archives_expanded considers each separate label-position from +# 0..frames_per_eg-1 to be a separate archive. +num_archives_expanded=$[$num_archives*$frames_per_eg] + +[ $num_jobs_initial -gt $num_jobs_final ] && \ + echo "$0: --initial-num-jobs cannot exceed --final-num-jobs" && exit 1; + +[ $num_jobs_final -gt $num_archives_expanded ] && \ + echo "$0: --final-num-jobs cannot exceed #archives $num_archives_expanded." && exit 1; + +if ! 
[ $num_hidden_layers -ge 1 ]; then + echo "Invalid num-hidden-layers $num_hidden_layers" + exit 1 +fi + +if [ $stage -le -2 ]; then + echo "$0: initializing neural net"; + tot_splice=$[($delta_order+1)*($left_context+1+$right_context)] + delta_feat_dim=$[($delta_order+1)*$feat_dim] + tot_input_dim=$[$feat_dim*$tot_splice] + num_patch1=$[1+($feat_dim-$patch_dim1)/$patch_step1] + num_pool=$[$num_patch1/$pool_size] + patch_dim2=$[$patch_dim2*$num_filters1] + patch_step2=$num_filters1 + patch_stride2=$[$num_pool*$num_filters1] # same as pool outputs + num_patch2=$[1+($num_pool*$num_filters1-$patch_dim2)/$patch_step2] + conv_out_dim1=$[$num_filters1*$num_patch1] # 128 x (36 - 7 + 1) + pool_out_dim=$[$num_filters1*$num_pool] + conv_out_dim2=$[$num_filters2*$num_patch2] + + online_preconditioning_opts="alpha=$alpha num-samples-history=$num_samples_history update-period=$update_period rank-in=$precondition_rank_in rank-out=$precondition_rank_out max-change-per-sample=$max_change_per_sample" + + initial_lrate=$(perl -e "print ($initial_effective_lrate*$num_jobs_initial);") + stddev=`perl -e "print 1.0/sqrt($hidden_dim);"` + cat >$dir/nnet.config <$dir/replace.1.config <$dir/replace.2.config <