[src] Nnet batch computation #2726

Merged: 24 commits (Oct 4, 2018)
Commits
f37be56  batch version nnet compute (LvHang, Sep 3, 2018)
49829cd  [src] Cosmetic changes (danpovey, Sep 8, 2018)
60ca584  Merge remote-tracking branch 'upstream/master' into nnet_batch_compute (danpovey, Sep 8, 2018)
f88c1c0  [src] Temp changes; don't compile. (danpovey, Sep 11, 2018)
abbcb15  [src] more progress on nnet3 batch computation framework (danpovey, Sep 18, 2018)
b55f7c4  [src,scripts] Fixes to nnet-batch-compute code; minor script fix (danpovey, Sep 19, 2018)
1d7bd46  [src] Add more diagnostics for batch-compute framework. (danpovey, Sep 19, 2018)
888b802  [src] Speedups for online ivector computation (danpovey, Sep 22, 2018)
de422b1  [src] Get batch decoding with multiple threads working; generic decod… (danpovey, Sep 25, 2018)
cd45de5  [scripts] Script support for GPU decoding with multiple decoder threads. (danpovey, Sep 25, 2018)
ae37a04  [src] Speedup to nnet3 compilation (danpovey, Sep 25, 2018)
7dcd369  [src] Minor fixes in RNNLM code (mostly affect diagnostics) (danpovey, Sep 25, 2018)
aedfd3a  Merge remote-tracking branch 'upstream/master' into nnet_batch_compute (danpovey, Sep 25, 2018)
8206f67  [src] Some optimizations to online feature extraction (danpovey, Sep 26, 2018)
547c2b6  [src] Efficiency improvements to i-vector extraction (danpovey, Sep 28, 2018)
a1f0560  [src] Further optimization to ivector extraction speed (danpovey, Sep 29, 2018)
39d33c5  [src] Add profiling utility (danpovey, Sep 29, 2018)
843f6d9  [src] Small efficiency improvements for i-vector extraction; remove d… (danpovey, Sep 29, 2018)
3f9d24c  Merge remote-tracking branch 'upstream/master' into nnet_batch_compute (danpovey, Sep 30, 2018)
43180b1  [src] Add previously missing file (danpovey, Sep 30, 2018)
d50a93b  [src] Cosmetic fixes (danpovey, Sep 30, 2018)
d8f6f96  [build,src] Automatically reordering libs in Makefiles; add dependenc… (danpovey, Oct 4, 2018)
8723ef1  [build,egs] Update nnet3/compute_output.sh to support batched computa… (danpovey, Oct 4, 2018)
64c68fd  [scripts] Add warning (danpovey, Oct 4, 2018)
7 changes: 5 additions & 2 deletions egs/wsj/s5/steps/nnet3/compute_output.sh
@@ -54,7 +54,7 @@ fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print

model=$srcdir/$iter.raw
if [ ! -f $srcdir/$iter.raw ]; then
echo "$0: WARNING: no such file $srcdir/$iter.raw. Trying $srcdir/$iter.mdl instead." && exit 1
echo "$0: WARNING: no such file $srcdir/$iter.raw. Trying $srcdir/$iter.mdl instead."
model=$srcdir/$iter.mdl
fi

@@ -104,12 +104,15 @@ gpu_queue_opt=

if $use_gpu; then
gpu_queue_opt="--gpu 1"
+  suffix="-batch"
gpu_opt="--use-gpu=yes"
else
gpu_opt="--use-gpu=no"
fi

if [ $stage -le 2 ]; then
$cmd $gpu_queue_opt JOB=1:$nj $dir/log/compute_output.JOB.log \
-    nnet3-compute $gpu_opt $ivector_opts $frame_subsampling_opt \
+    nnet3-compute$suffix $gpu_opt $ivector_opts $frame_subsampling_opt \
--frames-per-chunk=$frames_per_chunk \
--extra-left-context=$extra_left_context \
--extra-right-context=$extra_right_context \
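With these changes, passing --use-gpu true makes the script request a GPU slot from the queue and run the batched binary (the "-batch" suffix turns nnet3-compute into nnet3-compute-batch). A hypothetical invocation, purely for illustration: the data, model and output directories are placeholders, and the positional-argument order <data> <model-dir> <output-dir> is assumed, not taken from this diff.

  # Batched GPU forward pass over a test set; one GPU slot is requested per job.
  steps/nnet3/compute_output.sh --use-gpu true --nj 4 \
    data/test exp/nnet3/tdnn1a exp/nnet3/tdnn1a/output_test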
20 changes: 18 additions & 2 deletions egs/wsj/s5/steps/nnet3/decode.sh
@@ -20,6 +20,10 @@ ivector_scale=1.0
lattice_beam=8.0 # Beam we use in lattice generation.
iter=final
num_threads=1 # if >1, will use gmm-latgen-faster-parallel
+use_gpu=false   # If true, will use a GPU, with nnet3-latgen-faster-batch.
+                # In that case it is recommended to set num-threads to a large
+                # number, e.g. 20 if you have that many free CPU slots on a GPU
+                # node, and to use a small number of jobs.
scoring_opts=
skip_diagnostics=false
skip_scoring=false
@@ -49,6 +53,9 @@ if [ $# -ne 3 ]; then
echo " --iter <iter> # Iteration of model to decode; default is final."
echo " --scoring-opts <string> # options to local/score.sh"
echo " --num-threads <n> # number of threads to use, default 1."
echo " --use-gpu <true|false> # default: false. If true, we recommend"
echo " # to use large --num-threads as the graph"
echo " # search becomes the limiting factor."
exit 1;
fi

@@ -74,7 +81,16 @@ done
sdata=$data/split$nj;
cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1;
thread_string=
-[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads"
+if $use_gpu; then
+  if [ $num_threads -eq 1 ]; then
+    echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding."
+  fi
+  thread_string="-batch --num-threads=$num_threads"
+  queue_opt="--num-threads $num_threads --gpu 1"
+elif [ $num_threads -gt 1 ]; then
+  thread_string="-parallel --num-threads=$num_threads"
+  queue_opt="--num-threads $num_threads"
+fi

mkdir -p $dir/log
[[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1;
@@ -104,7 +120,7 @@ if [ -f $srcdir/frame_subsampling_factor ]; then
fi

if [ $stage -le 1 ]; then
-  $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \
+  $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \
nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \
--frames-per-chunk=$frames_per_chunk \
--extra-left-context=$extra_left_context \
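The help text above suggests pairing --use-gpu true with a large --num-threads, since the nnet evaluation moves to the GPU and the CPU-side graph search becomes the bottleneck. A hypothetical invocation (graph, data and decode directories are placeholders, not paths from this PR):

  # One GPU per job, many decoder threads per job, and few jobs overall.
  steps/nnet3/decode.sh --use-gpu true --num-threads 20 --nj 4 \
    exp/chain/tdnn1a/graph data/test exp/chain/tdnn1a/decode_test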
2 changes: 1 addition & 1 deletion egs/wsj/s5/utils/parallel/queue.pl
@@ -176,7 +176,7 @@ sub caught_signal {
option max_jobs_run=* -tc $0
default gpu=0
option gpu=0
-option gpu=* -l gpu=$0 -q g.q
+option gpu=* -l gpu=$0 -q '*.q'
EOF

# Here the configuration options specified by the user on the command line
2 changes: 1 addition & 1 deletion src/base/Makefile
@@ -18,7 +18,7 @@ include ../kaldi.mk

TESTFILES = kaldi-math-test io-funcs-test kaldi-error-test timer-test

-OBJFILES = kaldi-math.o kaldi-error.o io-funcs.o kaldi-utils.o
+OBJFILES = kaldi-math.o kaldi-error.o io-funcs.o kaldi-utils.o timer.o

LIBNAME = kaldi-base

1 change: 1 addition & 0 deletions src/base/kaldi-common.h
@@ -36,5 +36,6 @@
#include "base/kaldi-types.h"
#include "base/io-funcs.h"
#include "base/kaldi-math.h"
#include "base/timer.h"

#endif // KALDI_BASE_KALDI_COMMON_H_
85 changes: 85 additions & 0 deletions src/base/timer.cc
@@ -0,0 +1,85 @@
// base/timer.cc

// Copyright 2018 Johns Hopkins University (author: Daniel Povey)

// See ../../COPYING for clarification regarding multiple authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at

// http://www.apache.org/licenses/LICENSE-2.0

// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
// MERCHANTABLITY OR NON-INFRINGEMENT.
// See the Apache 2 License for the specific language governing permissions and
// limitations under the License.

#include "base/timer.h"
#include "base/kaldi-error.h"
#include <algorithm>
#include <iomanip>
#include <map>
#include <unordered_map>

namespace kaldi {

class ProfileStats {
public:
void AccStats(const char *function_name, double elapsed) {
std::unordered_map<const char*, ProfileStatsEntry>::iterator
iter = map_.find(function_name);
if (iter == map_.end()) {
map_[function_name] = ProfileStatsEntry(function_name);
map_[function_name].total_time = elapsed;
} else {
iter->second.total_time += elapsed;
}
}
~ProfileStats() {
// This map makes sure we agglomerate the time if there were any duplicate
// addresses of strings.
std::unordered_map<std::string, double> total_time;
for (auto iter = map_.begin(); iter != map_.end(); iter++)
total_time[iter->second.name] += iter->second.total_time;

ReverseSecondComparator comp;
std::vector<std::pair<std::string, double> > pairs(total_time.begin(),
total_time.end());
std::sort(pairs.begin(), pairs.end(), comp);
for (size_t i = 0; i < pairs.size(); i++) {
KALDI_LOG << "Time taken in " << pairs[i].first << " is "
<< std::fixed << std::setprecision(2) << pairs[i].second << "s.";
}
}
private:

struct ProfileStatsEntry {
std::string name;
double total_time;
ProfileStatsEntry() { }
ProfileStatsEntry(const char *name): name(name) { }
};

struct ReverseSecondComparator {
bool operator () (const std::pair<std::string, double> &a,
const std::pair<std::string, double> &b) {
return a.second > b.second;
}
};

// Note: this map is keyed on the address of the string, there is no proper
// hash function. The assumption is that the strings are compile-time
// constants.
std::unordered_map<const char*, ProfileStatsEntry> map_;
};

ProfileStats g_profile_stats;

Profiler::~Profiler() {
g_profile_stats.AccStats(name_, tim_.Elapsed());
}

} // namespace kaldi
24 changes: 22 additions & 2 deletions src/base/timer.h
@@ -20,7 +20,7 @@
#define KALDI_BASE_TIMER_H_

#include "base/kaldi-utils.h"
-// Note: Sleep(float secs) is included in base/kaldi-utils.h.
+#include "base/kaldi-error.h"


#if defined(_MSC_VER) || defined(MINGW)
@@ -87,7 +87,27 @@ class Timer {
struct timeval time_start_;
struct timezone time_zone_;
};
-}

+class Profiler {
+ public:
+  // Caution: the 'const char' should always be a string constant; for speed,
+  // internally the profiling code uses the address of it as a lookup key.
+  Profiler(const char *function_name): name_(function_name) { }
+  ~Profiler();
+ private:
+  Timer tim_;
+  const char *name_;
+};
+
+// To add timing info for a function, you just put
+//  KALDI_PROFILE;
+// at the beginning of the function. Caution: this doesn't
+// include the class name.
+#define KALDI_PROFILE Profiler _profiler(__func__)
+
+
+
+}  // namespace kaldi

#endif

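Taken together, timer.h and timer.cc give a minimal scoped profiler: a Profiler object starts a Timer on construction and, on destruction, adds the elapsed time to a global ProfileStats table keyed on the function-name string constant; the totals are logged, sorted by total time, when the process exits. A minimal usage sketch (the function below is a made-up example, not code from this PR):

  #include "base/timer.h"

  namespace kaldi {

  void SomeExpensiveFunction() {
    KALDI_PROFILE;  // expands to: Profiler _profiler(__func__);
    // ... work whose total time gets attributed to "SomeExpensiveFunction" ...
  }

  }  // namespace kaldi

At exit, ~ProfileStats() prints one line per profiled function, e.g. "Time taken in SomeExpensiveFunction is 12.34s.", with the most expensive functions first. Note the caution in the header: because the lookup key is the address of the string, the name passed to Profiler must be a compile-time constant (as __func__ is).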
4 changes: 2 additions & 2 deletions src/bin/Makefile
@@ -29,8 +29,8 @@ OBJFILES =
ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \
../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../base/kaldi-base.a


TESTFILES =
13 changes: 6 additions & 7 deletions src/bin/copy-post.cc
@@ -26,13 +26,13 @@
int main(int argc, char *argv[]) {
try {
using namespace kaldi;
typedef kaldi::int32 int32;
typedef kaldi::int32 int32;

const char *usage =
"Copy archives of posteriors, with optional scaling\n"
"(Also see rand-prune-post and sum-post)\n"
"\n"
"Usage: copy-post <post-rspecifier> <post-wspecifier>\n";
"Usage: copy-post <post-rspecifier> <post-wspecifier>\n"
"See also: post-to-weights, scale-post, sum-post, weight-post ...\n";

BaseFloat scale = 1.0;
ParseOptions po(usage);
@@ -43,15 +43,15 @@ int main(int argc, char *argv[]) {
po.PrintUsage();
exit(1);
}

std::string post_rspecifier = po.GetArg(1),
post_wspecifier = po.GetArg(2);

kaldi::SequentialPosteriorReader posterior_reader(post_rspecifier);
kaldi::PosteriorWriter posterior_writer(post_wspecifier);
kaldi::PosteriorWriter posterior_writer(post_wspecifier);

int32 num_done = 0;

for (; !posterior_reader.Done(); posterior_reader.Next()) {
std::string key = posterior_reader.Key();

@@ -71,4 +71,3 @@ int main(int argc, char *argv[]) {
return -1;
}
}

3 changes: 1 addition & 2 deletions src/chain/Makefile
@@ -18,8 +18,7 @@ LIBNAME = kaldi-chain

ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a \
../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \
-          ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a
+          ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a

# Make sure we have CUDA_ARCH from kaldi.mk,
ifeq ($(CUDA), true)
4 changes: 2 additions & 2 deletions src/chainbin/Makefile
@@ -25,7 +25,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \
../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \
../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../base/kaldi-base.a

include ../makefiles/default_rules.mk
3 changes: 1 addition & 2 deletions src/cudamatrix/Makefile
@@ -18,8 +18,7 @@ endif

LIBNAME = kaldi-cudamatrix

-ADDLIBS = ../util/kaldi-util.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a
+ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a

# Make sure we have CUDA_ARCH from kaldi.mk,
ifeq ($(CUDA), true)
17 changes: 17 additions & 0 deletions src/cudamatrix/cu-allocator.cc
@@ -586,6 +586,23 @@ void CuMemoryAllocator::SortSubregions() {
}
}

CuMemoryAllocator::~CuMemoryAllocator() {
// We mainly free these blocks of memory so that cuda-memcheck doesn't report
// spurious errors.
for (size_t i = 0; i < memory_regions_.size(); i++) {
// No need to check the return status here-- the program is exiting anyway.
cudaFree(memory_regions_[i].begin);
}
for (size_t i = 0; i < subregions_.size(); i++) {
SubRegion *subregion = subregions_[i];
for (auto iter = subregion->free_blocks.begin();
iter != subregion->free_blocks.end(); ++iter)
delete iter->second;
delete subregion;
}
}


CuMemoryAllocator g_cuda_allocator;


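For context on why the destructor bothers to free everything: the caching allocator holds on to large cudaMalloc'd regions for the lifetime of the process, and without this explicit cleanup cuda-memcheck reports them as spurious leaks at exit. A hypothetical check (the binary name and arguments are placeholders):

  # Run a GPU binary under CUDA's memory checker; with the destructor in place,
  # the cached regions no longer show up as leaked allocations.
  cuda-memcheck --leak-check full ./some-kaldi-gpu-binary [args]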
4 changes: 3 additions & 1 deletion src/cudamatrix/cu-allocator.h
@@ -54,7 +54,7 @@ struct CuAllocatorOptions {
bool cache_memory;

// The proportion of the device's memory that the CuAllocator allocates to
-  // start with; by default this is 0.8, although if you want to share the
+  // start with; by default this is 0.5, although if you want to share the
// device (not recommended!) you should set this lower.
BaseFloat memory_proportion;

@@ -187,6 +187,8 @@ class CuMemoryAllocator {
// by the user (c.f. RegisterCuAllocatorOptions()) before the options are read.
void SetOptions(const CuAllocatorOptions &opts) { opts_ = opts; }

+  ~CuMemoryAllocator();

private:

struct SubRegion;
2 changes: 1 addition & 1 deletion src/cudamatrix/cu-device.cc
@@ -102,7 +102,7 @@ void CuDevice::Initialize() {
if (!multi_threaded_) {
multi_threaded_ = true;
KALDI_WARN << "For multi-threaded code that might use GPU, you should call "
"CuDevice()::Instantiate().AllowMultithreading() at the start of "
"CuDevice::Instantiate().AllowMultithreading() at the start of "
"the program.";
}
device_id_copy_ = device_id_;
3 changes: 1 addition & 2 deletions src/cudamatrix/cu-matrix-inl.h
@@ -36,6 +36,7 @@ inline CuSubMatrix<Real>::CuSubMatrix(const CuMatrixBase<Real> &mat,
// initializer, so nothing to do.
} else {
KALDI_ASSERT(row_offset >= 0 && col_offset >= 0 &&
+               num_rows >= 0 && num_cols >= 0 &&
row_offset + num_rows <= mat.num_rows_ &&
col_offset + num_cols <= mat.num_cols_);
this->data_ = mat.data_ + static_cast<size_t>(col_offset) +
@@ -68,5 +69,3 @@ inline CuSubMatrix<Real>::CuSubMatrix(const Real *data,
} // namespace kaldi

#endif


9 changes: 4 additions & 5 deletions src/decoder/Makefile
@@ -7,14 +7,13 @@ TESTFILES =

OBJFILES = training-graph-compiler.o lattice-simple-decoder.o lattice-faster-decoder.o \
lattice-faster-online-decoder.o simple-decoder.o faster-decoder.o \
-           decoder-wrappers.o grammar-fst.o
+           decoder-wrappers.o grammar-fst.o decodable-matrix.o

LIBNAME = kaldi-decoder

-ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \
+ADDLIBS = ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \
../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \
-          ../tree/kaldi-tree.a ../util/kaldi-util.a \
-          ../fstext/kaldi-fstext.a \
-          ../matrix/kaldi-matrix.a ../base/kaldi-base.a
+          ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \
+          ../base/kaldi-base.a

include ../makefiles/default_rules.mk