diff --git a/egs/wsj/s5/steps/nnet3/compute_output.sh b/egs/wsj/s5/steps/nnet3/compute_output.sh index da3cb704878..e55f705043b 100755 --- a/egs/wsj/s5/steps/nnet3/compute_output.sh +++ b/egs/wsj/s5/steps/nnet3/compute_output.sh @@ -54,7 +54,7 @@ fdir=`perl -e '($dir,$pwd)= @ARGV; if($dir!~m:^/:) { $dir = "$pwd/$dir"; } print model=$srcdir/$iter.raw if [ ! -f $srcdir/$iter.raw ]; then - echo "$0: WARNING: no such file $srcdir/$iter.raw. Trying $srcdir/$iter.mdl instead." && exit 1 + echo "$0: WARNING: no such file $srcdir/$iter.raw. Trying $srcdir/$iter.mdl instead." model=$srcdir/$iter.mdl fi @@ -104,12 +104,15 @@ gpu_queue_opt= if $use_gpu; then gpu_queue_opt="--gpu 1" + suffix="-batch" gpu_opt="--use-gpu=yes" +else + gpu_opt="--use-gpu=no" fi if [ $stage -le 2 ]; then $cmd $gpu_queue_opt JOB=1:$nj $dir/log/compute_output.JOB.log \ - nnet3-compute $gpu_opt $ivector_opts $frame_subsampling_opt \ + nnet3-compute$suffix $gpu_opt $ivector_opts $frame_subsampling_opt \ --frames-per-chunk=$frames_per_chunk \ --extra-left-context=$extra_left_context \ --extra-right-context=$extra_right_context \ diff --git a/egs/wsj/s5/steps/nnet3/decode.sh b/egs/wsj/s5/steps/nnet3/decode.sh index 5b8374a5a1d..14dda2bd457 100755 --- a/egs/wsj/s5/steps/nnet3/decode.sh +++ b/egs/wsj/s5/steps/nnet3/decode.sh @@ -20,6 +20,10 @@ ivector_scale=1.0 lattice_beam=8.0 # Beam we use in lattice generation. iter=final num_threads=1 # if >1, will use gmm-latgen-faster-parallel +use_gpu=false # If true, will use a GPU, with nnet3-latgen-faster-batch. + # In that case it is recommended to set num-threads to a large + # number, e.g. 20 if you have that many free CPU slots on a GPU + # node, and to use a small number of jobs. scoring_opts= skip_diagnostics=false skip_scoring=false @@ -49,6 +53,9 @@ if [ $# -ne 3 ]; then echo " --iter # Iteration of model to decode; default is final." echo " --scoring-opts # options to local/score.sh" echo " --num-threads # number of threads to use, default 1." 
+ echo " --use-gpu # default: false. If true, we recommend" + echo " # to use large --num-threads as the graph" + echo " # search becomes the limiting factor." exit 1; fi @@ -74,7 +81,16 @@ done sdata=$data/split$nj; cmvn_opts=`cat $srcdir/cmvn_opts` || exit 1; thread_string= -[ $num_threads -gt 1 ] && thread_string="-parallel --num-threads=$num_threads" +if $use_gpu; then + if [ $num_threads -eq 1 ]; then + echo "$0: **Warning: we recommend to use --num-threads > 1 for GPU-based decoding." + fi + thread_string="-batch --num-threads=$num_threads" + queue_opt="--num-threads $num_threads --gpu 1" +elif [ $num_threads -gt 1 ]; then + thread_string="-parallel --num-threads=$num_threads" + queue_opt="--num-threads $num_threads" +fi mkdir -p $dir/log [[ -d $sdata && $data/feats.scp -ot $sdata ]] || split_data.sh $data $nj || exit 1; @@ -104,7 +120,7 @@ if [ -f $srcdir/frame_subsampling_factor ]; then fi if [ $stage -le 1 ]; then - $cmd --num-threads $num_threads JOB=1:$nj $dir/log/decode.JOB.log \ + $cmd $queue_opt JOB=1:$nj $dir/log/decode.JOB.log \ nnet3-latgen-faster$thread_string $ivector_opts $frame_subsampling_opt \ --frames-per-chunk=$frames_per_chunk \ --extra-left-context=$extra_left_context \ diff --git a/egs/wsj/s5/utils/parallel/queue.pl b/egs/wsj/s5/utils/parallel/queue.pl index e14af5ef6e3..bddcb4fec23 100755 --- a/egs/wsj/s5/utils/parallel/queue.pl +++ b/egs/wsj/s5/utils/parallel/queue.pl @@ -176,7 +176,7 @@ sub caught_signal { option max_jobs_run=* -tc $0 default gpu=0 option gpu=0 -option gpu=* -l gpu=$0 -q g.q +option gpu=* -l gpu=$0 -q '*.q' EOF # Here the configuration options specified by the user on the command line diff --git a/src/base/Makefile b/src/base/Makefile index 583c6badcf2..49af4f87ff4 100644 --- a/src/base/Makefile +++ b/src/base/Makefile @@ -18,7 +18,7 @@ include ../kaldi.mk TESTFILES = kaldi-math-test io-funcs-test kaldi-error-test timer-test -OBJFILES = kaldi-math.o kaldi-error.o io-funcs.o kaldi-utils.o +OBJFILES = kaldi-math.o 
kaldi-error.o io-funcs.o kaldi-utils.o timer.o LIBNAME = kaldi-base diff --git a/src/base/kaldi-common.h b/src/base/kaldi-common.h index e0002d91bb7..264565d1812 100644 --- a/src/base/kaldi-common.h +++ b/src/base/kaldi-common.h @@ -36,5 +36,6 @@ #include "base/kaldi-types.h" #include "base/io-funcs.h" #include "base/kaldi-math.h" +#include "base/timer.h" #endif // KALDI_BASE_KALDI_COMMON_H_ diff --git a/src/base/timer.cc b/src/base/timer.cc new file mode 100644 index 00000000000..ce4ef292783 --- /dev/null +++ b/src/base/timer.cc @@ -0,0 +1,85 @@ +// base/timer.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 + +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "base/timer.h" +#include "base/kaldi-error.h" +#include +#include +#include +#include + +namespace kaldi { + +class ProfileStats { + public: + void AccStats(const char *function_name, double elapsed) { + std::unordered_map::iterator + iter = map_.find(function_name); + if (iter == map_.end()) { + map_[function_name] = ProfileStatsEntry(function_name); + map_[function_name].total_time = elapsed; + } else { + iter->second.total_time += elapsed; + } + } + ~ProfileStats() { + // This map makes sure we agglomerate the time if there were any duplicate + // addresses of strings. 
+ std::unordered_map total_time; + for (auto iter = map_.begin(); iter != map_.end(); iter++) + total_time[iter->second.name] += iter->second.total_time; + + ReverseSecondComparator comp; + std::vector > pairs(total_time.begin(), + total_time.end()); + std::sort(pairs.begin(), pairs.end(), comp); + for (size_t i = 0; i < pairs.size(); i++) { + KALDI_LOG << "Time taken in " << pairs[i].first << " is " + << std::fixed << std::setprecision(2) << pairs[i].second << "s."; + } + } + private: + + struct ProfileStatsEntry { + std::string name; + double total_time; + ProfileStatsEntry() { } + ProfileStatsEntry(const char *name): name(name) { } + }; + + struct ReverseSecondComparator { + bool operator () (const std::pair &a, + const std::pair &b) { + return a.second > b.second; + } + }; + + // Note: this map is keyed on the address of the string, there is no proper + // hash function. The assumption is that the strings are compile-time + // constants. + std::unordered_map map_; +}; + +ProfileStats g_profile_stats; + +Profiler::~Profiler() { + g_profile_stats.AccStats(name_, tim_.Elapsed()); +} + +} // namespace kaldi diff --git a/src/base/timer.h b/src/base/timer.h index 7889c4a258b..96c5babb305 100644 --- a/src/base/timer.h +++ b/src/base/timer.h @@ -20,7 +20,7 @@ #define KALDI_BASE_TIMER_H_ #include "base/kaldi-utils.h" -// Note: Sleep(float secs) is included in base/kaldi-utils.h. +#include "base/kaldi-error.h" #if defined(_MSC_VER) || defined(MINGW) @@ -87,7 +87,27 @@ class Timer { struct timeval time_start_; struct timezone time_zone_; }; -} + +class Profiler { + public: + // Caution: the 'const char' should always be a string constant; for speed, + // internally the profiling code uses the address of it as a lookup key. + Profiler(const char *function_name): name_(function_name) { } + ~Profiler(); + private: + Timer tim_; + const char *name_; +}; + +// To add timing info for a function, you just put +// KALDI_PROFILE; +// at the beginning of the function. 
Caution: this doesn't +// include the class name. +#define KALDI_PROFILE Profiler _profiler(__func__) + + + +} // namespace kaldi #endif diff --git a/src/bin/Makefile b/src/bin/Makefile index c2b9eb48830..b0a660a6ad1 100644 --- a/src/bin/Makefile +++ b/src/bin/Makefile @@ -29,8 +29,8 @@ OBJFILES = ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a TESTFILES = diff --git a/src/bin/copy-post.cc b/src/bin/copy-post.cc index 6d0d351a594..d5ca3f42980 100644 --- a/src/bin/copy-post.cc +++ b/src/bin/copy-post.cc @@ -26,13 +26,13 @@ int main(int argc, char *argv[]) { try { using namespace kaldi; - typedef kaldi::int32 int32; + typedef kaldi::int32 int32; const char *usage = "Copy archives of posteriors, with optional scaling\n" - "(Also see rand-prune-post and sum-post)\n" "\n" - "Usage: copy-post \n"; + "Usage: copy-post \n" + "See also: post-to-weights, scale-post, sum-post, weight-post ...\n"; BaseFloat scale = 1.0; ParseOptions po(usage); @@ -43,15 +43,15 @@ int main(int argc, char *argv[]) { po.PrintUsage(); exit(1); } - + std::string post_rspecifier = po.GetArg(1), post_wspecifier = po.GetArg(2); kaldi::SequentialPosteriorReader posterior_reader(post_rspecifier); - kaldi::PosteriorWriter posterior_writer(post_wspecifier); + kaldi::PosteriorWriter posterior_writer(post_wspecifier); int32 num_done = 0; - + for (; !posterior_reader.Done(); posterior_reader.Next()) { std::string key = posterior_reader.Key(); @@ -71,4 +71,3 @@ int main(int argc, char *argv[]) { return -1; } } - diff --git a/src/chain/Makefile b/src/chain/Makefile index 2a735c2ca2d..fbad28f7de6 100644 --- a/src/chain/Makefile +++ b/src/chain/Makefile @@ -18,8 +18,7 @@ LIBNAME = kaldi-chain ADDLIBS = 
../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a # Make sure we have CUDA_ARCH from kaldi.mk, ifeq ($(CUDA), true) diff --git a/src/chainbin/Makefile b/src/chainbin/Makefile index 61f653f174f..41ac7342d17 100644 --- a/src/chainbin/Makefile +++ b/src/chainbin/Makefile @@ -25,7 +25,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/cudamatrix/Makefile b/src/cudamatrix/Makefile index ca831390ea9..45c2ba44fd7 100644 --- a/src/cudamatrix/Makefile +++ b/src/cudamatrix/Makefile @@ -18,8 +18,7 @@ endif LIBNAME = kaldi-cudamatrix -ADDLIBS = ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a +ADDLIBS = ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a # Make sure we have CUDA_ARCH from kaldi.mk, ifeq ($(CUDA), true) diff --git a/src/cudamatrix/cu-allocator.cc b/src/cudamatrix/cu-allocator.cc index cfbc6757530..bda76bd5589 100644 --- a/src/cudamatrix/cu-allocator.cc +++ b/src/cudamatrix/cu-allocator.cc @@ -586,6 +586,23 @@ void CuMemoryAllocator::SortSubregions() { } } +CuMemoryAllocator::~CuMemoryAllocator() { + // We mainly free these blocks of memory so that cuda-memcheck doesn't report + // spurious errors. + for (size_t i = 0; i < memory_regions_.size(); i++) { + // No need to check the return status here-- the program is exiting anyway. 
+ cudaFree(memory_regions_[i].begin); + } + for (size_t i = 0; i < subregions_.size(); i++) { + SubRegion *subregion = subregions_[i]; + for (auto iter = subregion->free_blocks.begin(); + iter != subregion->free_blocks.end(); ++iter) + delete iter->second; + delete subregion; + } +} + + CuMemoryAllocator g_cuda_allocator; diff --git a/src/cudamatrix/cu-allocator.h b/src/cudamatrix/cu-allocator.h index 20425704a2b..9dd2bb82aea 100644 --- a/src/cudamatrix/cu-allocator.h +++ b/src/cudamatrix/cu-allocator.h @@ -54,7 +54,7 @@ struct CuAllocatorOptions { bool cache_memory; // The proportion of the device's memory that the CuAllocator allocates to - // start with; by default this is 0.8, although if you want to share the + // start with; by default this is 0.5, although if you want to share the // device (not recommended!) you should set this lower. BaseFloat memory_proportion; @@ -187,6 +187,8 @@ class CuMemoryAllocator { // by the user (c.f. RegisterCuAllocatorOptions()) before the options are read. void SetOptions(const CuAllocatorOptions &opts) { opts_ = opts; } + ~CuMemoryAllocator(); + private: struct SubRegion; diff --git a/src/cudamatrix/cu-device.cc b/src/cudamatrix/cu-device.cc index 37912ea8adf..49c179b3673 100644 --- a/src/cudamatrix/cu-device.cc +++ b/src/cudamatrix/cu-device.cc @@ -102,7 +102,7 @@ void CuDevice::Initialize() { if (!multi_threaded_) { multi_threaded_ = true; KALDI_WARN << "For multi-threaded code that might use GPU, you should call " - "CuDevice()::Instantiate().AllowMultithreading() at the start of " + "CuDevice::Instantiate().AllowMultithreading() at the start of " "the program."; } device_id_copy_ = device_id_; diff --git a/src/cudamatrix/cu-matrix-inl.h b/src/cudamatrix/cu-matrix-inl.h index 9b7a707d2e5..0e182d4e72a 100644 --- a/src/cudamatrix/cu-matrix-inl.h +++ b/src/cudamatrix/cu-matrix-inl.h @@ -36,6 +36,7 @@ inline CuSubMatrix::CuSubMatrix(const CuMatrixBase &mat, // initializer, so nothing to do. 
} else { KALDI_ASSERT(row_offset >= 0 && col_offset >= 0 && + num_rows >= 0 && num_cols >= 0 && row_offset + num_rows <= mat.num_rows_ && col_offset + num_cols <= mat.num_cols_); this->data_ = mat.data_ + static_cast(col_offset) + @@ -68,5 +69,3 @@ inline CuSubMatrix::CuSubMatrix(const Real *data, } // namespace kaldi #endif - - diff --git a/src/decoder/Makefile b/src/decoder/Makefile index 35c84758779..020fe358fe9 100644 --- a/src/decoder/Makefile +++ b/src/decoder/Makefile @@ -7,14 +7,13 @@ TESTFILES = OBJFILES = training-graph-compiler.o lattice-simple-decoder.o lattice-faster-decoder.o \ lattice-faster-online-decoder.o simple-decoder.o faster-decoder.o \ - decoder-wrappers.o grammar-fst.o + decoder-wrappers.o grammar-fst.o decodable-matrix.o LIBNAME = kaldi-decoder -ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ +ADDLIBS = ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../fstext/kaldi-fstext.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/decoder/decodable-matrix.cc b/src/decoder/decodable-matrix.cc new file mode 100644 index 00000000000..3cc7b87f2d7 --- /dev/null +++ b/src/decoder/decodable-matrix.cc @@ -0,0 +1,107 @@ +// decoder/decodable-matrix.cc + +// Copyright 2018 Johns Hopkins University (author: Daniel Povey) + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include "decoder/decodable-matrix.h" + +namespace kaldi { + +DecodableMatrixMapped::DecodableMatrixMapped( + const TransitionModel &tm, + const MatrixBase &likes, + int32 frame_offset): + trans_model_(tm), likes_(&likes), likes_to_delete_(NULL), + frame_offset_(frame_offset) { + stride_ = likes.Stride(); + raw_data_ = likes.Data() - (stride_ * frame_offset); + + if (likes.NumCols() != tm.NumPdfs()) + KALDI_ERR << "Mismatch, matrix has " + << likes.NumCols() << " rows but transition-model has " + << tm.NumPdfs() << " pdf-ids."; +} + +DecodableMatrixMapped::DecodableMatrixMapped( + const TransitionModel &tm, const Matrix *likes, + int32 frame_offset): + trans_model_(tm), likes_(likes), likes_to_delete_(likes), + frame_offset_(frame_offset) { + stride_ = likes->Stride(); + raw_data_ = likes->Data() - (stride_ * frame_offset_); + if (likes->NumCols() != tm.NumPdfs()) + KALDI_ERR << "Mismatch, matrix has " + << likes->NumCols() << " rows but transition-model has " + << tm.NumPdfs() << " pdf-ids."; +} + + +BaseFloat DecodableMatrixMapped::LogLikelihood(int32 frame, int32 tid) { + int32 pdf_id = trans_model_.TransitionIdToPdfFast(tid); +#ifdef KALDI_PARANOID + return (*likes_)(frame - frame_offset_, pdf_id); +#else + return raw_data_[frame * stride_ + pdf_id]; +#endif +} + +int32 DecodableMatrixMapped::NumFramesReady() const { + return frame_offset_ + likes_->NumRows(); +} + +bool DecodableMatrixMapped::IsLastFrame(int32 frame) const { + KALDI_ASSERT(frame < NumFramesReady()); + return 
(frame == NumFramesReady() - 1); +} + +// Indices are one-based! This is for compatibility with OpenFst. +int32 DecodableMatrixMapped::NumIndices() const { + return trans_model_.NumTransitionIds(); +} + +DecodableMatrixMapped::~DecodableMatrixMapped() { + delete likes_to_delete_; +} + + +void DecodableMatrixMappedOffset::AcceptLoglikes( + Matrix *loglikes, int32 frames_to_discard) { + if (loglikes->NumRows() == 0) return; + KALDI_ASSERT(loglikes->NumCols() == trans_model_.NumPdfs()); + KALDI_ASSERT(frames_to_discard <= loglikes_.NumRows() && + frames_to_discard >= 0); + if (frames_to_discard == loglikes_.NumRows()) { + loglikes_.Swap(loglikes); + loglikes->Resize(0, 0); + } else { + int32 old_rows_kept = loglikes_.NumRows() - frames_to_discard, + new_num_rows = old_rows_kept + loglikes->NumRows(); + Matrix new_loglikes(new_num_rows, loglikes->NumCols()); + new_loglikes.RowRange(0, old_rows_kept).CopyFromMat( + loglikes_.RowRange(frames_to_discard, old_rows_kept)); + new_loglikes.RowRange(old_rows_kept, loglikes->NumRows()).CopyFromMat( + *loglikes); + loglikes_.Swap(&new_loglikes); + } + frame_offset_ += frames_to_discard; + stride_ = loglikes_.Stride(); + raw_data_ = loglikes_.Data() - (frame_offset_ * stride_); +} + + + +} // end namespace kaldi. diff --git a/src/decoder/decodable-matrix.h b/src/decoder/decodable-matrix.h index de70ea82753..f32a007e6ca 100644 --- a/src/decoder/decodable-matrix.h +++ b/src/decoder/decodable-matrix.h @@ -32,8 +32,7 @@ namespace kaldi { class DecodableMatrixScaledMapped: public DecodableInterface { public: - // This constructor creates an object that will not delete "likes" - // when done. + // This constructor creates an object that will not delete "likes" when done. 
DecodableMatrixScaledMapped(const TransitionModel &tm, const Matrix &likes, BaseFloat scale): trans_model_(tm), likes_(&likes), @@ -55,7 +54,7 @@ class DecodableMatrixScaledMapped: public DecodableInterface { KALDI_ERR << "DecodableMatrixScaledMapped: mismatch, matrix has " << likes->NumCols() << " rows but transition-model has " << tm.NumPdfs() << " pdf-ids."; - } + } virtual int32 NumFramesReady() const { return likes_->NumRows(); } @@ -66,7 +65,7 @@ class DecodableMatrixScaledMapped: public DecodableInterface { // Note, frames are numbered from zero. virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { - return scale_ * (*likes_)(frame, trans_model_.TransitionIdToPdf(tid)); + return scale_ * (*likes_)(frame, trans_model_.TransitionIdToPdfFast(tid)); } // Indices are one-based! This is for compatibility with OpenFst. @@ -83,6 +82,59 @@ class DecodableMatrixScaledMapped: public DecodableInterface { KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableMatrixScaledMapped); }; +/** + This is like DecodableMatrixScaledMapped, but it doesn't support an acoustic + scale, and it does support a frame offset, whereby you can state that the + first row of 'likes' is actually the n'th row of the matrix of available + log-likelihoods. It's useful if the neural net output comes in chunks for + different frame ranges. + + Note: DecodableMatrixMappedOffset solves the same problem in a slightly + different way, where you use the same decodable object. This one, unlike + DecodableMatrixMappedOffset, is compatible with when the loglikes are in a + SubMatrix. + */ +class DecodableMatrixMapped: public DecodableInterface { + public: + // This constructor creates an object that will not delete "likes" when done. + // the frame_offset is the frame the row 0 of 'likes' corresponds to, would be + // greater than one if this is not the first chunk of likelihoods. 
+ DecodableMatrixMapped(const TransitionModel &tm, + const MatrixBase &likes, + int32 frame_offset = 0); + + // This constructor creates an object that will delete "likes" + // when done. + DecodableMatrixMapped(const TransitionModel &tm, + const Matrix *likes, + int32 frame_offset = 0); + + virtual int32 NumFramesReady() const; + + virtual bool IsLastFrame(int32 frame) const; + + virtual BaseFloat LogLikelihood(int32 frame, int32 tid); + + // Note: these indices are 1-based. + virtual int32 NumIndices() const; + + virtual ~DecodableMatrixMapped(); + + private: + const TransitionModel &trans_model_; // for tid to pdf mapping + const MatrixBase *likes_; + const Matrix *likes_to_delete_; + int32 frame_offset_; + + // raw_data_ and stride_ are a kind of fast look-aside for 'likes_', to be + // used when KALDI_PARANOID is false. + const BaseFloat *raw_data_; + int32 stride_; + + KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableMatrixMapped); +}; + + /** This decodable class returns log-likes stored in a matrix; it supports repeatedly writing to the matrix and setting a time-offset representing the @@ -91,68 +143,51 @@ class DecodableMatrixScaledMapped: public DecodableInterface { code will call SetLoglikes() each time more log-likelihods are available. If you try to access a log-likelihood that's no longer available because the frame index is less than the current offset, it is of course an error. + + See also DecodableMatrixMapped, which supports the same type of thing but + with a different interface where you are expected to re-construct the + object each time you want to decode. 
*/ class DecodableMatrixMappedOffset: public DecodableInterface { public: DecodableMatrixMappedOffset(const TransitionModel &tm): - trans_model_(tm), frame_offset_(0), input_is_finished_(false) { } - - + trans_model_(tm), frame_offset_(0), input_is_finished_(false) { } virtual int32 NumFramesReady() { return frame_offset_ + loglikes_.NumRows(); } // this is not part of the generic Decodable interface. int32 FirstAvailableFrame() { return frame_offset_; } - + + // Logically, this function appends 'loglikes' (interpreted as newly available + // frames) to the log-likelihoods stored in the class. + // // This function is destructive of the input "loglikes" because it may // under some circumstances do a shallow copy using Swap(). This function // appends loglikes to any existing likelihoods you've previously supplied. - // frames_to_discard, if nonzero, will discard that number of previously - // available frames, from the left, advancing FirstAvailableFrame() by - // a number equal to frames_to_discard. You should only set frames_to_discard - // to nonzero if you know your decoder won't want to access the loglikes - // for older frames. 
void AcceptLoglikes(Matrix *loglikes, - int32 frames_to_discard) { - if (loglikes->NumRows() == 0) return; - KALDI_ASSERT(loglikes->NumCols() == trans_model_.NumPdfs()); - KALDI_ASSERT(frames_to_discard <= loglikes_.NumRows() && - frames_to_discard >= 0); - if (frames_to_discard == loglikes_.NumRows()) { - loglikes_.Swap(loglikes); - loglikes->Resize(0, 0); - } else { - int32 old_rows_kept = loglikes_.NumRows() - frames_to_discard, - new_num_rows = old_rows_kept + loglikes->NumRows(); - Matrix new_loglikes(new_num_rows, loglikes->NumCols()); - new_loglikes.RowRange(0, old_rows_kept).CopyFromMat( - loglikes_.RowRange(frames_to_discard, old_rows_kept)); - new_loglikes.RowRange(old_rows_kept, loglikes->NumRows()).CopyFromMat( - *loglikes); - loglikes_.Swap(&new_loglikes); - } - frame_offset_ += frames_to_discard; - } + int32 frames_to_discard); void InputIsFinished() { input_is_finished_ = true; } - + virtual int32 NumFramesReady() const { return loglikes_.NumRows() + frame_offset_; } - + virtual bool IsLastFrame(int32 frame) const { KALDI_ASSERT(frame < NumFramesReady()); return (frame == NumFramesReady() - 1 && input_is_finished_); } virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { - int32 index = frame - frame_offset_; - KALDI_ASSERT(index >= 0 && index < loglikes_.NumRows()); - return loglikes_(index, trans_model_.TransitionIdToPdf(tid)); + int32 pdf_id = trans_model_.TransitionIdToPdfFast(tid); +#ifdef KALDI_PARANOID + return loglikes_(frame - frame_offset_, pdf_id); +#else + // This does no checking, so will be faster. + return raw_data_[frame * stride_ + pdf_id]; +#endif } - - virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } // nothing special to do in destructor. @@ -162,6 +197,15 @@ class DecodableMatrixMappedOffset: public DecodableInterface { Matrix loglikes_; int32 frame_offset_; bool input_is_finished_; + + // 'raw_data_' and 'stride_' are intended as a fast look-aside which is an + // alternative to accessing data_. 
raw_data_ is a faked version of + // data_->Data() as if it started from frame zero rather than frame_offset_. + // This simplifies the code of LogLikelihood(), in cases where KALDI_PARANOID + // is not defined. + BaseFloat *raw_data_; + int32 stride_; + KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableMatrixMappedOffset); }; @@ -171,20 +215,20 @@ class DecodableMatrixScaled: public DecodableInterface { DecodableMatrixScaled(const Matrix &likes, BaseFloat scale): likes_(likes), scale_(scale) { } - + virtual int32 NumFramesReady() const { return likes_.NumRows(); } - + virtual bool IsLastFrame(int32 frame) const { KALDI_ASSERT(frame < NumFramesReady()); return (frame == NumFramesReady() - 1); } - + // Note, frames are numbered from zero. virtual BaseFloat LogLikelihood(int32 frame, int32 index) { - if (index > likes_.NumCols() || index <= 0 || + if (index > likes_.NumCols() || index <= 0 || frame < 0 || frame >= likes_.NumRows()) - KALDI_ERR << "Invalid (frame, index - 1) = (" - << frame << ", " << index - 1 << ") for matrix of size " + KALDI_ERR << "Invalid (frame, index - 1) = (" + << frame << ", " << index - 1 << ") for matrix of size " << likes_.NumRows() << " x " << likes_.NumCols(); return scale_ * likes_(frame, index - 1); } diff --git a/src/decoder/decoder-wrappers.cc b/src/decoder/decoder-wrappers.cc index 76f95dab7cc..ff573c74d15 100644 --- a/src/decoder/decoder-wrappers.cc +++ b/src/decoder/decoder-wrappers.cc @@ -382,7 +382,7 @@ bool DecodeUtteranceLatticeSimple( for (size_t i = 0; i < words.size(); i++) { std::string s = word_syms->Find(words[i]); if (s == "") - KALDI_ERR << "Word-id " << words[i] <<" not in symbol table."; + KALDI_ERR << "Word-id " << words[i] << " not in symbol table."; std::cerr << s << ' '; } std::cerr << '\n'; diff --git a/src/decoder/grammar-fst.h b/src/decoder/grammar-fst.h index 70ceadc8daa..f66933c132d 100644 --- a/src/decoder/grammar-fst.h +++ b/src/decoder/grammar-fst.h @@ -87,6 +87,9 @@ template<> class ArcIterator; sub-FSTs that 
represent nonterminals in the grammar; and multiple return points whenever we invoke a nonterminal. For more information see \ref grammar (i.e. ../doc/grammar.dox). + + Caution: this class is not thread safe, i.e. you shouldn't access the same + GrammarFst from multiple threads. We can fix this later if needed. */ class GrammarFst { public: diff --git a/src/feat/Makefile b/src/feat/Makefile index 2af9da2ec59..dcd029f7f94 100644 --- a/src/feat/Makefile +++ b/src/feat/Makefile @@ -16,7 +16,7 @@ OBJFILES = feature-functions.o feature-mfcc.o feature-plp.o feature-fbank.o \ LIBNAME = kaldi-feat ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc index 267a4724580..f35cf631752 100644 --- a/src/feat/online-feature.cc +++ b/src/feat/online-feature.cc @@ -143,7 +143,9 @@ void OnlineCmvnState::Read(std::istream &is, bool binary) { OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts, const OnlineCmvnState &cmvn_state, OnlineFeatureInterface *src): - opts_(opts), src_(src) { + opts_(opts), temp_stats_(2, src->Dim() + 1), + temp_feats_(src->Dim()), temp_feats_dbl_(src->Dim()), + src_(src) { SetState(cmvn_state); if (!SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_)) KALDI_ERR << "Bad --skip-dims option (should be colon-separated list of " @@ -151,7 +153,10 @@ OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts, } OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts, - OnlineFeatureInterface *src): opts_(opts), src_(src) { + OnlineFeatureInterface *src): + opts_(opts), temp_stats_(2, src->Dim() + 1), + temp_feats_(src->Dim()), temp_feats_dbl_(src->Dim()), + src_(src) { if (!SplitStringToIntegers(opts.skip_dims, ":", false, &skip_dims_)) KALDI_ERR << "Bad 
--skip-dims option (should be colon-separated list of " << "integers)"; @@ -160,7 +165,7 @@ OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts, void OnlineCmvn::GetMostRecentCachedFrame(int32 frame, int32 *cached_frame, - Matrix *stats) { + MatrixBase *stats) { KALDI_ASSERT(frame >= 0); InitRingBufferIfNeeded(); // look for a cached frame on a previous frame as close as possible in time @@ -174,7 +179,7 @@ void OnlineCmvn::GetMostRecentCachedFrame(int32 frame, int32 index = t % opts_.ring_buffer_size; if (cached_stats_ring_[index].first == t) { *cached_frame = t; - *stats = cached_stats_ring_[index].second; + stats->CopyFromMat(cached_stats_ring_[index].second); return; } } @@ -182,7 +187,7 @@ void OnlineCmvn::GetMostRecentCachedFrame(int32 frame, if (n >= cached_stats_modulo_.size()) { if (cached_stats_modulo_.size() == 0) { *cached_frame = -1; - stats->Resize(2, this->Dim() + 1); + stats->SetZero(); return; } else { n = static_cast(cached_stats_modulo_.size() - 1); @@ -190,7 +195,7 @@ void OnlineCmvn::GetMostRecentCachedFrame(int32 frame, } *cached_frame = n * opts_.modulus; KALDI_ASSERT(cached_stats_modulo_[n] != NULL); - *stats = *(cached_stats_modulo_[n]); + stats->CopyFromMat(*(cached_stats_modulo_[n])); } // Initialize ring buffer for caching stats. @@ -202,7 +207,7 @@ void OnlineCmvn::InitRingBufferIfNeeded() { } } -void OnlineCmvn::CacheFrame(int32 frame, const Matrix &stats) { +void OnlineCmvn::CacheFrame(int32 frame, const MatrixBase &stats) { KALDI_ASSERT(frame >= 0); if (frame % opts_.modulus == 0) { // store in cached_stats_modulo_. 
int32 n = frame / opts_.modulus; @@ -239,18 +244,18 @@ void OnlineCmvn::ComputeStatsForFrame(int32 frame, KALDI_ASSERT(frame >= 0 && frame < src_->NumFramesReady()); int32 dim = this->Dim(), cur_frame; - Matrix stats(2, dim + 1); - GetMostRecentCachedFrame(frame, &cur_frame, &stats); + GetMostRecentCachedFrame(frame, &cur_frame, stats_out); - Vector feats(dim); - Vector feats_dbl(dim); + Vector &feats(temp_feats_); + Vector &feats_dbl(temp_feats_dbl_); while (cur_frame < frame) { cur_frame++; src_->GetFrame(cur_frame, &feats); feats_dbl.CopyFromVec(feats); - stats.Row(0).Range(0, dim).AddVec(1.0, feats_dbl); - stats.Row(1).Range(0, dim).AddVec2(1.0, feats_dbl); - stats(0, dim) += 1.0; + stats_out->Row(0).Range(0, dim).AddVec(1.0, feats_dbl); + if (opts_.normalize_variance) + stats_out->Row(1).Range(0, dim).AddVec2(1.0, feats_dbl); + (*stats_out)(0, dim) += 1.0; // it's a sliding buffer; a frame at the back may be // leaving the buffer so we have to subtract that. int32 prev_frame = cur_frame - opts_.cmn_window; @@ -258,13 +263,13 @@ void OnlineCmvn::ComputeStatsForFrame(int32 frame, // we need to subtract frame prev_f from the stats. src_->GetFrame(prev_frame, &feats); feats_dbl.CopyFromVec(feats); - stats.Row(0).Range(0, dim).AddVec(-1.0, feats_dbl); - stats.Row(1).Range(0, dim).AddVec2(-1.0, feats_dbl); - stats(0, dim) -= 1.0; + stats_out->Row(0).Range(0, dim).AddVec(-1.0, feats_dbl); + if (opts_.normalize_variance) + stats_out->Row(1).Range(0, dim).AddVec2(-1.0, feats_dbl); + (*stats_out)(0, dim) -= 1.0; } - CacheFrame(cur_frame, stats); + CacheFrame(cur_frame, (*stats_out)); } - stats_out->CopyFromMat(stats); } @@ -273,6 +278,16 @@ void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase &speaker_stats, const MatrixBase &global_stats, const OnlineCmvnOptions &opts, MatrixBase *stats) { + if (speaker_stats.NumRows() == 2 && !opts.normalize_variance) { + // this is just for efficiency: don't operate on the variance if it's not + // needed. 
+ int32 cols = speaker_stats.NumCols(); // dim + 1 + SubMatrix stats_temp(*stats, 0, 1, 0, cols); + SmoothOnlineCmvnStats(speaker_stats.RowRange(0, 1), + global_stats.RowRange(0, 1), + opts, &stats_temp); + return; + } int32 dim = stats->NumCols() - 1; double cur_count = (*stats)(0, dim); // If count exceeded cmn_window it would be an error in how "window_stats" @@ -311,7 +326,8 @@ void OnlineCmvn::GetFrame(int32 frame, src_->GetFrame(frame, feat); KALDI_ASSERT(feat->Dim() == this->Dim()); int32 dim = feat->Dim(); - Matrix stats(2, dim + 1); + Matrix &stats(temp_stats_); + stats.Resize(2, dim + 1, kUndefined); // Will do nothing if size was correct. if (frozen_state_.NumRows() != 0) { // the CMVN state has been frozen. stats.CopyFromMat(frozen_state_); } else { @@ -329,14 +345,13 @@ void OnlineCmvn::GetFrame(int32 frame, // call the function ApplyCmvn declared in ../transform/cmvn.h, which // requires a matrix. - Matrix feat_mat(1, dim); - feat_mat.Row(0).CopyFromVec(*feat); + // 1 row; num-cols == dim; stride == dim. + SubMatrix feat_mat(feat->Data(), 1, dim, dim); // the function ApplyCmvn takes a matrix, so form a one-row matrix to give it. 
if (opts_.normalize_mean) ApplyCmvn(stats, opts_.normalize_variance, &feat_mat); else KALDI_ASSERT(!opts_.normalize_variance); - feat->CopyFromVec(feat_mat.Row(0)); } void OnlineCmvn::Freeze(int32 cur_frame) { @@ -430,6 +445,17 @@ void OnlineTransform::GetFrame(int32 frame, VectorBase *feat) { feat->AddMatVec(1.0, linear_term_, kNoTrans, input_feat, 1.0); } +void OnlineTransform::GetFrames( + const std::vector &frames, MatrixBase *feats) { + KALDI_ASSERT(static_cast(frames.size()) == feats->NumRows()); + int32 num_frames = feats->NumRows(), + input_dim = linear_term_.NumCols(); + Matrix input_feats(num_frames, input_dim, kUndefined); + src_->GetFrames(frames, &input_feats); + feats->CopyRowsFromVec(offset_); + feats->AddMatMat(1.0, input_feats, kNoTrans, linear_term_, kTrans, 1.0); +} + int32 OnlineDeltaFeature::Dim() const { int32 src_dim = src_->Dim(); @@ -493,6 +519,44 @@ void OnlineCacheFeature::GetFrame(int32 frame, VectorBase *feat) { } } +void OnlineCacheFeature::GetFrames( + const std::vector &frames, MatrixBase *feats) { + int32 num_frames = frames.size(); + // non_cached_frames will be the subset of 't' values in 'frames' which were + // not previously cached, which we therefore need to get from src_. + std::vector non_cached_frames; + // 'non_cached_indexes' stores the indexes 'i' into 'frames' corresponding to + // the corresponding frames in 'non_cached_frames'. 
+ std::vector non_cached_indexes; + non_cached_frames.reserve(frames.size()); + non_cached_indexes.reserve(frames.size()); + for (int32 i = 0; i < num_frames; i++) { + int32 t = frames[i]; + if (static_cast(t) < cache_.size() && cache_[t] != NULL) { + feats->Row(i).CopyFromVec(*(cache_[t])); + } else { + non_cached_frames.push_back(t); + non_cached_indexes.push_back(i); + } + } + if (non_cached_frames.empty()) + return; + int32 num_non_cached_frames = non_cached_frames.size(), + dim = this->Dim(); + Matrix non_cached_feats(num_non_cached_frames, dim, + kUndefined); + src_->GetFrames(non_cached_frames, &non_cached_feats); + for (int32 i = 0; i < num_non_cached_frames; i++) { + SubVector this_feat(non_cached_feats, i); + feats->Row(non_cached_indexes[i]).CopyFromVec(this_feat); + int32 t = non_cached_frames[i]; + if (static_cast(t) >= cache_.size()) + cache_.resize(t + 1, NULL); + cache_[t] = new Vector(this_feat); + } +} + + void OnlineCacheFeature::ClearCache() { for (size_t i = 0; i < cache_.size(); i++) delete cache_[i]; @@ -500,7 +564,6 @@ void OnlineCacheFeature::ClearCache() { } - void OnlineAppendFeature::GetFrame(int32 frame, VectorBase *feat) { KALDI_ASSERT(feat->Dim() == Dim()); diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h index 11d170972fa..d41bb6747c7 100644 --- a/src/feat/online-feature.h +++ b/src/feat/online-feature.h @@ -182,7 +182,8 @@ struct OnlineCmvnOptions { // class computes the cmvn internally. smaller->more // time-efficient but less memory-efficient. Must be >= 1. int32 ring_buffer_size; // not configurable from command line; size of ring - // buffer used for caching CMVN stats. + // buffer used for caching CMVN stats. Must be >= + // modulus. std::string skip_dims; // Colon-separated list of dimensions to skip normalization // of, e.g. 13:14:15. @@ -371,10 +372,10 @@ class OnlineCmvn: public OnlineFeatureInterface { /// were cached, sets up empty stats for frame zero and returns that]. 
void GetMostRecentCachedFrame(int32 frame, int32 *cached_frame, - Matrix *stats); + MatrixBase *stats); /// Cache this frame of stats. - void CacheFrame(int32 frame, const Matrix &stats); + void CacheFrame(int32 frame, const MatrixBase &stats); /// Initialize ring buffer for caching stats. inline void InitRingBufferIfNeeded(); @@ -403,6 +404,12 @@ class OnlineCmvn: public OnlineFeatureInterface { // frame index. std::vector > > cached_stats_ring_; + // Some temporary variables used inside functions of this class, which + // are put here to avoid reallocation. + Matrix temp_stats_; + Vector temp_feats_; + Vector temp_feats_dbl_; + OnlineFeatureInterface *src_; // Not owned here }; @@ -472,6 +479,9 @@ class OnlineTransform: public OnlineFeatureInterface { virtual void GetFrame(int32 frame, VectorBase *feat); + virtual void GetFrames(const std::vector &frames, + MatrixBase *feats); + // // Next, functions that are not in the interface. // @@ -537,6 +547,9 @@ class OnlineCacheFeature: public OnlineFeatureInterface { virtual void GetFrame(int32 frame, VectorBase *feat); + virtual void GetFrames(const std::vector &frames, + MatrixBase *feats); + virtual ~OnlineCacheFeature() { ClearCache(); } // Things that are not in the shared interface: diff --git a/src/featbin/Makefile b/src/featbin/Makefile index 8e72d0f744c..861ba3f7a93 100644 --- a/src/featbin/Makefile +++ b/src/featbin/Makefile @@ -25,7 +25,7 @@ TESTFILES = ADDLIBS = ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fgmmbin/Makefile b/src/fgmmbin/Makefile index baa4cd9be33..5db252477b5 100644 --- a/src/fgmmbin/Makefile +++ b/src/fgmmbin/Makefile @@ -18,7 +18,6 @@ TESTFILES = ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a
../hmm/kaldi-hmm.a \ ../feat/kaldi-feat.a ../transform/kaldi-transform.a \ ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fstbin/Makefile b/src/fstbin/Makefile index 4236282b3fc..a22c014a7d5 100644 --- a/src/fstbin/Makefile +++ b/src/fstbin/Makefile @@ -26,6 +26,6 @@ TESTFILES = LIBFILE = ADDLIBS = ../decoder/kaldi-decoder.a ../fstext/kaldi-fstext.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/fstext/Makefile b/src/fstext/Makefile index dc25ddae95b..b76bd413c42 100644 --- a/src/fstext/Makefile +++ b/src/fstext/Makefile @@ -24,7 +24,7 @@ LIBNAME = kaldi-fstext # tree and matrix archives needed for test-context-fst # matrix archive needed for push-special. -ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a +ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/gmm/Makefile b/src/gmm/Makefile index d8aedadfd93..caee6734afe 100644 --- a/src/gmm/Makefile +++ b/src/gmm/Makefile @@ -14,8 +14,8 @@ OBJFILES = diag-gmm.o diag-gmm-normal.o mle-diag-gmm.o am-diag-gmm.o \ LIBNAME = kaldi-gmm -ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a +ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a diff --git a/src/gmm/diag-gmm.h b/src/gmm/diag-gmm.h index 1243d7a6bfd..4a10ea34471 100644 --- a/src/gmm/diag-gmm.h +++ b/src/gmm/diag-gmm.h @@ -100,7 +100,7 @@ class DiagGmm { const std::vector &indices, Vector *loglikes) const; - /// Get gaussian selection information for one frame. 
Returns og-like + /// Get gaussian selection information for one frame. Returns log-like /// this frame. Output is the best "num_gselect" indices, sorted from best to /// worst likelihood. If "num_gselect" > NumGauss(), sets it to NumGauss(). BaseFloat GaussianSelection(const VectorBase &data, diff --git a/src/gmmbin/Makefile b/src/gmmbin/Makefile index 72a0fa15e73..82d10abe9ce 100644 --- a/src/gmmbin/Makefile +++ b/src/gmmbin/Makefile @@ -37,8 +37,8 @@ TESTFILES = ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/hmm/Makefile b/src/hmm/Makefile index 6da3b7b7757..0ad5da74c28 100644 --- a/src/hmm/Makefile +++ b/src/hmm/Makefile @@ -9,8 +9,8 @@ OBJFILES = hmm-topology.o transition-model.o hmm-utils.o tree-accu.o \ posterior.o hmm-test-utils.o LIBNAME = kaldi-hmm -ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a +ADDLIBS = ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/hmm/posterior-test.cc b/src/hmm/posterior-test.cc index b6958674f9b..0906cb3d0dc 100644 --- a/src/hmm/posterior-test.cc +++ b/src/hmm/posterior-test.cc @@ -33,12 +33,12 @@ void TestVectorToPosteriorEntry() { std::vector > post_entry; - BaseFloat ans = VectorToPosteriorEntry(loglikes, gselect, min_post, &post_entry); + VectorToPosteriorEntry(loglikes, gselect, min_post, &post_entry); KALDI_ASSERT(post_entry.size() <= gselect); int32 max_elem; - BaseFloat max_val = loglikes.Max(&max_elem); + loglikes.Max(&max_elem); KALDI_ASSERT(post_entry[0].first == max_elem); 
KALDI_ASSERT(post_entry.back().second >= min_post); @@ -48,7 +48,6 @@ void TestVectorToPosteriorEntry() { for (size_t i = 0; i < post_entry.size(); i++) sum += post_entry[i].second; KALDI_ASSERT(fabs(sum - 1.0) < 0.01); - KALDI_ASSERT(ans >= max_val); } void TestPosteriorIo() { @@ -92,4 +91,3 @@ int main() { } std::cout << "Test OK.\n"; } - diff --git a/src/hmm/posterior.cc b/src/hmm/posterior.cc index 42db6e99cf4..860a979a0ce 100644 --- a/src/hmm/posterior.cc +++ b/src/hmm/posterior.cc @@ -402,7 +402,7 @@ void WeightSilencePostDistributed(const TransitionModel &trans_model, for (size_t i = 0; i < post->size(); i++) { std::vector > this_post; this_post.reserve((*post)[i].size()); - BaseFloat sil_weight = 0.0, nonsil_weight = 0.0; + BaseFloat sil_weight = 0.0, nonsil_weight = 0.0; for (size_t j = 0; j < (*post)[i].size(); j++) { int32 tid = (*post)[i][j].first, phone = trans_model.TransitionIdToPhone(tid); @@ -418,12 +418,23 @@ void WeightSilencePostDistributed(const TransitionModel &trans_model, if (frame_scale != 0.0) { for (size_t j = 0; j < (*post)[i].size(); j++) { int32 tid = (*post)[i][j].first; - BaseFloat weight = (*post)[i][j].second; + BaseFloat weight = (*post)[i][j].second; this_post.push_back(std::make_pair(tid, weight * frame_scale)); } } - (*post)[i].swap(this_post); + (*post)[i].swap(this_post); + } +} + +inline static BaseFloat GetTotalPosterior( + const std::vector > &post_entry) { + BaseFloat tot = 0.0; + std::vector >::const_iterator + iter = post_entry.begin(), end = post_entry.end(); + for (; iter != end; ++iter) { + tot += iter->second; } + return tot; } BaseFloat VectorToPosteriorEntry( @@ -434,39 +445,66 @@ BaseFloat VectorToPosteriorEntry( KALDI_ASSERT(num_gselect > 0 && min_post >= 0 && min_post < 1.0); // we name num_gauss assuming each entry in log_likes represents a Gaussian; // it doesn't matter if they don't. 
+ int32 num_gauss = log_likes.Dim(); KALDI_ASSERT(num_gauss > 0); if (num_gselect > num_gauss) num_gselect = num_gauss; - Vector log_likes_normalized(log_likes); - BaseFloat ans = log_likes_normalized.ApplySoftMax(); - std::vector > temp_post(num_gauss); - for (int32 g = 0; g < num_gauss; g++) - temp_post[g] = std::pair(g, log_likes_normalized(g)); + std::vector > temp_post; + BaseFloat max_like = log_likes.Max(); + if (min_post != 0.0) { + BaseFloat like_cutoff = max_like + Log(min_post); + for (int32 g = 0; g < num_gauss; g++) { + BaseFloat like = log_likes(g); + if (like > like_cutoff) { + BaseFloat post = exp(like - max_like); + temp_post.push_back(std::pair(g, post)); + } + } + } + if (temp_post.empty()) { + // we reach here if min_post was 0.0 or if no posteriors reached the + // threshold min_post (we need at least one). + temp_post.resize(num_gauss); + for (int32 g = 0; g < num_gauss; g++) + temp_post[g] = std::pair(g, Exp(log_likes(g) - max_like)); + } + CompareReverseSecond compare; - // Sort in decreasing order on posterior. For efficiency we - // first do nth_element and then sort, as we only need the part we're - // going to output, to be sorted. - std::nth_element(temp_post.begin(), - temp_post.begin() + num_gselect, temp_post.end(), - compare); - std::sort(temp_post.begin(), temp_post.begin() + num_gselect, - compare); + if (static_cast(temp_post.size()) > num_gselect * 2) { + // Sort in decreasing order on posterior. For efficiency we + // first do nth_element and then sort, as we only need the part we're + // going to output, to be sorted. 
+ std::nth_element(temp_post.begin(), + temp_post.begin() + num_gselect, temp_post.end(), + compare); + std::sort(temp_post.begin(), temp_post.begin() + num_gselect, + compare); + } else { + std::sort(temp_post.begin(), temp_post.end(), compare); + } + + size_t num_to_insert = std::min(temp_post.size(), + num_gselect); post_entry->clear(); post_entry->insert(post_entry->end(), - temp_post.begin(), temp_post.begin() + num_gselect); - while (post_entry->size() > 1 && post_entry->back().second < min_post) - post_entry->pop_back(); + temp_post.begin(), temp_post.begin() + num_to_insert); + + BaseFloat tot_post = GetTotalPosterior(*post_entry), + cutoff = min_post * tot_post; + + while (post_entry->size() > 1 && post_entry->back().second < cutoff) { + tot_post -= post_entry->back().second; + post_entry->pop_back(); + } // Now renormalize to sum to one after pruning. - BaseFloat tot = 0.0; - size_t size = post_entry->size(); - for (size_t i = 0; i < size; i++) - tot += (*post_entry)[i].second; - BaseFloat inv_tot = 1.0 / tot; - for (size_t i = 0; i < size; i++) - (*post_entry)[i].second *= inv_tot; - return ans; + BaseFloat inv_tot = 1.0 / tot_post; + auto end = post_entry->end(); + for (auto iter = post_entry->begin(); iter != end; ++iter) + iter->second *= inv_tot; + + return max_like + log(tot_post); } diff --git a/src/hmm/posterior.h b/src/hmm/posterior.h index cfe3fc44572..0c255845dd5 100644 --- a/src/hmm/posterior.h +++ b/src/hmm/posterior.h @@ -190,8 +190,9 @@ struct CompareReverseSecond { /// by applying Softmax(), then prunes the posteriors using "gselect" and /// "min_post" (keeping at least one), and outputs the result into /// "post_entry", sorted from greatest to least posterior. -/// Returns the total log-likelihood (the output of calling ApplySoftMax() -/// on a copy of log_likes). +/// +/// It returns the log of the sum of the selected log-likes that contributed +/// to the posterior. 
BaseFloat VectorToPosteriorEntry( const VectorBase &log_likes, int32 num_gselect, diff --git a/src/hmm/transition-model.cc b/src/hmm/transition-model.cc index 83edbaf5805..5ecb7776f00 100644 --- a/src/hmm/transition-model.cc +++ b/src/hmm/transition-model.cc @@ -166,7 +166,7 @@ void TransitionModel::ComputeDerived() { id2state_.resize(cur_transition_id); // cur_transition_id is #transition-ids+1. id2pdf_id_.resize(cur_transition_id); - for (int32 tstate = 1; tstate <= static_cast(tuples_.size()); tstate++) + for (int32 tstate = 1; tstate <= static_cast(tuples_.size()); tstate++) { for (int32 tid = state2id_[tstate]; tid < state2id_[tstate+1]; tid++) { id2state_[tid] = tstate; if (IsSelfLoop(tid)) @@ -174,6 +174,17 @@ void TransitionModel::ComputeDerived() { else id2pdf_id_[tid] = tuples_[tstate-1].forward_pdf; } + } + + // The following statements put copies of a large number in the region of memory + // past the end of the id2pdf_id_ array, while leaving the array as it was + // before. The goal of this is to speed up decoding by disabling a check + // inside TransitionIdToPdf() that the transition-id was within the correct + // range. + int32 num_big_numbers = std::min(2000, cur_transition_id); + id2pdf_id_.resize(cur_transition_id + num_big_numbers, + std::numeric_limits::max()); + id2pdf_id_.resize(cur_transition_id); } void TransitionModel::InitializeProbs() { diff --git a/src/hmm/transition-model.h b/src/hmm/transition-model.h index 9843dff946b..f03b54e8b71 100644 --- a/src/hmm/transition-model.h +++ b/src/hmm/transition-model.h @@ -156,6 +156,10 @@ class TransitionModel { // this state doesn't have a self-loop. inline int32 TransitionIdToPdf(int32 trans_id) const; + // TransitionIdToPdfFast is as TransitionIdToPdf but skips an assertion + // (unless we're in paranoid mode).
+ inline int32 TransitionIdToPdfFast(int32 trans_id) const; + int32 TransitionIdToPhone(int32 trans_id) const; int32 TransitionIdToPdfClass(int32 trans_id) const; int32 TransitionIdToHmmState(int32 trans_id) const; @@ -316,14 +320,26 @@ class TransitionModel { /// of pdfs). int32 num_pdfs_; - KALDI_DISALLOW_COPY_AND_ASSIGN(TransitionModel); - }; inline int32 TransitionModel::TransitionIdToPdf(int32 trans_id) const { - KALDI_ASSERT(static_cast(trans_id) < id2pdf_id_.size() && - "Likely graph/model mismatch (graph built from wrong model?)"); + KALDI_ASSERT( + static_cast(trans_id) < id2pdf_id_.size() && + "Likely graph/model mismatch (graph built from wrong model?)"); + return id2pdf_id_[trans_id]; +} + +inline int32 TransitionModel::TransitionIdToPdfFast(int32 trans_id) const { + // Note: it's a little dangerous to assert this only in paranoid mode. + // However, this function is called in the inner loop of decoders and + // the assertion likely takes a significant amount of time. We make + // sure that past the end of the id2pdf_id_ array there are big + // numbers, which will make the calling code more likely to segfault + // (rather than silently die) if this is called for out-of-range values. + KALDI_PARANOID_ASSERT( + static_cast(trans_id) < id2pdf_id_.size() && + "Likely graph/model mismatch (graph built from wrong model?)"); return id2pdf_id_[trans_id]; } diff --git a/src/itf/decodable-itf.h b/src/itf/decodable-itf.h index 9852861969d..9f1f2f62e2b 100644 --- a/src/itf/decodable-itf.h +++ b/src/itf/decodable-itf.h @@ -72,19 +72,18 @@ namespace kaldi { always just return the number of frames in the file, and IsLastFrame() will return true for the last frame. - For truly online decoding, the "old" online decodable objects in ../online/ have a - "blocking" IsLastFrame() and will crash if you call NumFramesReady().
+ For truly online decoding, the "old" online decodable objects in ../online/ + have a "blocking" IsLastFrame() and will crash if you call NumFramesReady(). The "new" online decodable objects in ../online2/ return the number of frames currently accessible if you call NumFramesReady(). You will likely not need to call IsLastFrame(), but we implement it to only return true for the last frame of the file once we've decided to terminate decoding. */ - class DecodableInterface { public: /// Returns the log likelihood, which will be negated in the decoder. - /// The "frame" starts from zero. You should verify that IsLastFrame(frame-1) - /// returns false before calling this. + /// The "frame" starts from zero. You should verify that NumFramesReady() > frame + /// before calling this. virtual BaseFloat LogLikelihood(int32 frame, int32 index) = 0; /// Returns true if this is the last frame. Frames are zero-based, so the diff --git a/src/itf/online-feature-itf.h b/src/itf/online-feature-itf.h index 3837024ab55..22c1c392450 100644 --- a/src/itf/online-feature-itf.h +++ b/src/itf/online-feature-itf.h @@ -45,11 +45,11 @@ namespace kaldi { implementing a child class you must not make assumptions about the order in which the user makes these calls. */ - + class OnlineFeatureInterface { public: virtual int32 Dim() const = 0; /// returns the feature dimension. - + /// Returns the total number of frames, since the start of the utterance, that /// are now available. In an online-decoding context, this will likely /// increase with time as more data becomes available. @@ -65,7 +65,7 @@ class OnlineFeatureInterface { /// many frames are in the decodable object (as it used to be, and for backward /// compatibility, still is, in the Decodable interface). virtual bool IsLastFrame(int32 frame) const = 0; - + /// Gets the feature vector for this frame. 
Before calling this for a given /// frame, it is assumed that you called NumFramesReady() and it returned a /// number greater than "frame". Otherwise this call will likely crash with @@ -74,6 +74,21 @@ class OnlineFeatureInterface { /// the class. virtual void GetFrame(int32 frame, VectorBase *feat) = 0; + + /// This is like GetFrame() but for a collection of frames. There is a + /// default implementation that just gets the frames one by one, but it + /// may be overridden for efficiency by child classes (since sometimes + /// it's more efficient to do things in a batch). + virtual void GetFrames(const std::vector &frames, + MatrixBase *feats) { + KALDI_ASSERT(static_cast(frames.size()) == feats->NumRows()); + for (size_t i = 0; i < frames.size(); i++) { + SubVector feat(*feats, i); + GetFrame(frames[i], &feat); + } + } + + // Returns frame shift in seconds. Helps to estimate duration from frame // counts. virtual BaseFloat FrameShiftInSeconds() const = 0; @@ -81,8 +96,8 @@ class OnlineFeatureInterface { /// Virtual destructor. Note: constructors that take another member of /// type OnlineFeatureInterface are not expected to take ownership of /// that pointer; the caller needs to keep track of that manually. 
- virtual ~OnlineFeatureInterface() { } - + virtual ~OnlineFeatureInterface() { } + }; diff --git a/src/ivector/Makefile b/src/ivector/Makefile index 408018befa4..1154da6880b 100644 --- a/src/ivector/Makefile +++ b/src/ivector/Makefile @@ -13,8 +13,8 @@ OBJFILES = ivector-extractor.o voice-activity-detection.o plda.o \ LIBNAME = kaldi-ivector ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/ivector/ivector-extractor.cc b/src/ivector/ivector-extractor.cc index aaba3837698..c3a122281c2 100644 --- a/src/ivector/ivector-extractor.cc +++ b/src/ivector/ivector-extractor.cc @@ -578,10 +578,96 @@ void OnlineIvectorEstimationStats::AccStats( quadratic_term_.AddToDiag(prior_scale_change); } } + num_frames_ += tot_weight; +} + + +// This is used in OnlineIvectorEstimationStats::AccStats(). +struct GaussInfo { + // total weight for this Gaussian. 
+ BaseFloat tot_weight; + // vector of pairs of (frame-index, weight for this Gaussian) + std::vector > frame_weights; + GaussInfo(): tot_weight(0.0) { } +}; + +static void ConvertPostToGaussInfo( + const std::vector > > &gauss_post, + std::unordered_map *gauss_info) { + int32 num_frames = gauss_post.size(); + for (int32 t = 0; t < num_frames; t++) { + const std::vector > &this_post = gauss_post[t]; + auto iter = this_post.begin(), end = this_post.end(); + for (; iter != end; ++iter) { + int32 gauss_idx = iter->first; + GaussInfo &info = (*gauss_info)[gauss_idx]; + BaseFloat weight = iter->second; + info.tot_weight += weight; + info.frame_weights.push_back(std::pair(t, weight)); + } + } +} + +void OnlineIvectorEstimationStats::AccStats( + const IvectorExtractor &extractor, + const MatrixBase &features, + const std::vector > > &gauss_post) { + KALDI_ASSERT(extractor.IvectorDim() == this->IvectorDim()); + KALDI_ASSERT(!extractor.IvectorDependentWeights()); + + int32 feat_dim = features.NumCols(); + std::unordered_map gauss_info; + ConvertPostToGaussInfo(gauss_post, &gauss_info); + + Vector weighted_feats(feat_dim, kUndefined); + double tot_weight = 0.0; + int32 ivector_dim = this->IvectorDim(), + quadratic_term_dim = (ivector_dim * (ivector_dim + 1)) / 2; + SubVector quadratic_term_vec(quadratic_term_.Data(), + quadratic_term_dim); + + std::unordered_map::const_iterator + iter = gauss_info.begin(), end = gauss_info.end(); + for (; iter != end; ++iter) { + int32 gauss_idx = iter->first; + const GaussInfo &info = iter->second; + + weighted_feats.SetZero(); + std::vector >::const_iterator + f_iter = info.frame_weights.begin(), f_end = info.frame_weights.end(); + for (; f_iter != f_end; ++f_iter) { + int32 t = f_iter->first; + BaseFloat weight = f_iter->second; + weighted_feats.AddVec(weight, features.Row(t)); + } + BaseFloat this_tot_weight = info.tot_weight; + linear_term_.AddMatVec(1.0, extractor.Sigma_inv_M_[gauss_idx], kTrans, + weighted_feats, 1.0); + SubVector 
U_g(extractor.U_, gauss_idx); + quadratic_term_vec.AddVec(this_tot_weight, U_g); + tot_weight += this_tot_weight; + } + if (max_count_ > 0.0) { + // see comments in header RE max_count for explanation. It relates to + // prior scaling when the count exceeds max_count_ + double old_num_frames = num_frames_, + new_num_frames = num_frames_ + tot_weight; + double old_prior_scale = std::max(old_num_frames, max_count_) / max_count_, + new_prior_scale = std::max(new_num_frames, max_count_) / max_count_; + // The prior_scales are the inverses of the scales we would put on the stats + // if we were implementing this by scaling the stats. Instead we + // scale the prior term. + double prior_scale_change = new_prior_scale - old_prior_scale; + if (prior_scale_change != 0.0) { + linear_term_(0) += prior_offset_ * prior_scale_change; + quadratic_term_.AddToDiag(prior_scale_change); + } + } num_frames_ += tot_weight; } + void OnlineIvectorEstimationStats::Scale(double scale) { KALDI_ASSERT(scale >= 0.0 && scale <= 1.0); double old_num_frames = num_frames_; diff --git a/src/ivector/ivector-extractor.h b/src/ivector/ivector-extractor.h index 9641d9d79e8..3b9b6f3eb5c 100644 --- a/src/ivector/ivector-extractor.h +++ b/src/ivector/ivector-extractor.h @@ -323,10 +323,17 @@ class OnlineIvectorEstimationStats { OnlineIvectorEstimationStats(const OnlineIvectorEstimationStats &other); + // Accumulate stats for one frame. void AccStats(const IvectorExtractor &extractor, const VectorBase &feature, const std::vector > &gauss_post); + // Accumulate stats for a sequence (or collection) of frames. + void AccStats(const IvectorExtractor &extractor, + const MatrixBase &features, + const std::vector > > &gauss_post); + + int32 IvectorDim() const { return linear_term_.Dim(); } /// This function gets the current estimate of the iVector. 
Internally it diff --git a/src/ivectorbin/Makefile b/src/ivectorbin/Makefile index 75a17708c43..5a738352d9c 100644 --- a/src/ivectorbin/Makefile +++ b/src/ivectorbin/Makefile @@ -26,7 +26,7 @@ TESTFILES = ADDLIBS = ../ivector/kaldi-ivector.a ../hmm/kaldi-hmm.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/kws/Makefile b/src/kws/Makefile index a5b74ea2229..c4367eb2958 100644 --- a/src/kws/Makefile +++ b/src/kws/Makefile @@ -10,8 +10,7 @@ OBJFILES = kws-functions.o kws-functions2.o kws-scoring.o LIBNAME = kaldi-kws ADDLIBS = ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/kwsbin/Makefile b/src/kwsbin/Makefile index cade044e153..bcc2685b7f3 100644 --- a/src/kwsbin/Makefile +++ b/src/kwsbin/Makefile @@ -17,7 +17,6 @@ TESTFILES = ADDLIBS = ../kws/kaldi-kws.a ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a \ ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/lat/Makefile b/src/lat/Makefile index bba2329fdf6..56521486826 100644 --- a/src/lat/Makefile +++ b/src/lat/Makefile @@ -16,8 +16,7 @@ OBJFILES = kaldi-lattice.o lattice-functions.o word-align-lattice.o \ LIBNAME = kaldi-lat ADDLIBS = ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/latbin/Makefile b/src/latbin/Makefile index afff54cb845..9809cdcbb85 100644 --- 
a/src/latbin/Makefile +++ b/src/latbin/Makefile @@ -32,10 +32,9 @@ OBJFILES = TESTFILES = -ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../lat/kaldi-lat.a ../nnet3/kaldi-nnet3.a ../lm/kaldi-lm.a \ +ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../nnet3/kaldi-nnet3.a \ + ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a ../lm/kaldi-lm.a \ ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a \ - ../cudamatrix/kaldi-cudamatrix.a ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/lm/Makefile b/src/lm/Makefile index 3dfb409f970..c0654fa83b2 100644 --- a/src/lm/Makefile +++ b/src/lm/Makefile @@ -12,7 +12,6 @@ OBJFILES = arpa-file-parser.o arpa-lm-compiler.o const-arpa-lm.o \ LIBNAME = kaldi-lm ADDLIBS = ../fstext/kaldi-fstext.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/lmbin/Makefile b/src/lmbin/Makefile index c88f6151a8f..108ddab50c5 100644 --- a/src/lmbin/Makefile +++ b/src/lmbin/Makefile @@ -10,7 +10,7 @@ OBJFILES = TESTFILES = -ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a +ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/matrix/kaldi-vector.h b/src/matrix/kaldi-vector.h index 3eb4a932095..383d8ca2862 100644 --- a/src/matrix/kaldi-vector.h +++ b/src/matrix/kaldi-vector.h @@ -514,8 +514,9 @@ class SubVector : public VectorBase { /// Constructor from a pointer to memory and a length. Keeps a pointer /// to the data but does not take ownership (will never delete). - SubVector(Real *data, MatrixIndexT length) : VectorBase () { - VectorBase::data_ = data; + /// Caution: this constructor enables you to evade const constraints. 
+ SubVector(const Real *data, MatrixIndexT length) : VectorBase () { + VectorBase::data_ = const_cast(data); VectorBase::dim_ = length; } @@ -594,4 +595,3 @@ Real VecMatVec(const VectorBase &v1, const MatrixBase &M, #endif // KALDI_MATRIX_KALDI_VECTOR_H_ - diff --git a/src/nnet/Makefile b/src/nnet/Makefile index 99f54ae2af2..7f324479a0f 100644 --- a/src/nnet/Makefile +++ b/src/nnet/Makefile @@ -15,8 +15,8 @@ OBJFILES = nnet-nnet.o nnet-component.o nnet-loss.o \ LIBNAME = kaldi-nnet ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../hmm/kaldi-hmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet2/Makefile b/src/nnet2/Makefile index 5fc27419ec1..7c19ec2603c 100644 --- a/src/nnet2/Makefile +++ b/src/nnet2/Makefile @@ -27,7 +27,7 @@ LIBNAME = kaldi-nnet2 ADDLIBS = ../cudamatrix/kaldi-cudamatrix.a ../lat/kaldi-lat.a \ ../hmm/kaldi-hmm.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet2/decodable-am-nnet.h b/src/nnet2/decodable-am-nnet.h index e3dedb33727..6c40b11bf9d 100644 --- a/src/nnet2/decodable-am-nnet.h +++ b/src/nnet2/decodable-am-nnet.h @@ -76,14 +76,14 @@ class DecodableAmNnet: public DecodableInterface { // from one (this routine is called by FSTs). virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id) { return log_probs_(frame, - trans_model_.TransitionIdToPdf(transition_id)); + trans_model_.TransitionIdToPdfFast(transition_id)); } virtual int32 NumFramesReady() const { return log_probs_.NumRows(); } - + // Indices are one-based! This is for compatibility with OpenFst. 
virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } - + virtual bool IsLastFrame(int32 frame) const { KALDI_ASSERT(frame < NumFramesReady()); return (frame == NumFramesReady() - 1); @@ -139,7 +139,7 @@ class DecodableAmNnetParallel: public DecodableInterface { virtual BaseFloat LogLikelihood(int32 frame, int32 transition_id) { if (feats_) Compute(); // this function sets feats_ to NULL. return log_probs_(frame, - trans_model_.TransitionIdToPdf(transition_id)); + trans_model_.TransitionIdToPdfFast(transition_id)); } int32 NumFramesReady() const { @@ -155,10 +155,10 @@ class DecodableAmNnetParallel: public DecodableInterface { return log_probs_.NumRows(); } } - + // Indices are one-based! This is for compatibility with OpenFst. virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } - + virtual bool IsLastFrame(int32 frame) const { KALDI_ASSERT(frame < NumFramesReady()); return (frame == NumFramesReady() - 1); @@ -180,7 +180,7 @@ class DecodableAmNnetParallel: public DecodableInterface { - + } // namespace nnet2 } // namespace kaldi diff --git a/src/nnet2bin/Makefile b/src/nnet2bin/Makefile index 3280acfc968..b7e2c385006 100644 --- a/src/nnet2bin/Makefile +++ b/src/nnet2bin/Makefile @@ -38,7 +38,7 @@ ADDLIBS = ../nnet2/kaldi-nnet2.a ../nnet/kaldi-nnet.a \ ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3/Makefile b/src/nnet3/Makefile index 135853cadc3..aac16fb1c86 100644 --- a/src/nnet3/Makefile +++ b/src/nnet3/Makefile @@ -31,15 +31,16 @@ OBJFILES = nnet-common.o nnet-compile.o nnet-component-itf.o \ nnet-compile-looped.o decodable-simple-looped.o \ 
decodable-online-looped.o convolution.o \ nnet-convolutional-component.o attention.o \ - nnet-attention-component.o nnet-tdnn-component.o + nnet-attention-component.o nnet-tdnn-component.o nnet-batch-compute.o LIBNAME = kaldi-nnet3 ADDLIBS = ../chain/kaldi-chain.a ../cudamatrix/kaldi-cudamatrix.a \ - ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ + ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \ + ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3/decodable-online-looped.cc b/src/nnet3/decodable-online-looped.cc index f231a2d5b62..5817df5fd25 100644 --- a/src/nnet3/decodable-online-looped.cc +++ b/src/nnet3/decodable-online-looped.cc @@ -244,7 +244,7 @@ BaseFloat DecodableAmNnetLoopedOnline::LogLikelihood(int32 subsampled_frame, EnsureFrameIsComputed(subsampled_frame); return current_log_post_( subsampled_frame - current_log_post_subsampled_offset_, - trans_model_.TransitionIdToPdf(index)); + trans_model_.TransitionIdToPdfFast(index)); } diff --git a/src/nnet3/decodable-simple-looped.cc b/src/nnet3/decodable-simple-looped.cc index d4edb440d5a..0452304cf55 100644 --- a/src/nnet3/decodable-simple-looped.cc +++ b/src/nnet3/decodable-simple-looped.cc @@ -257,7 +257,7 @@ DecodableAmNnetSimpleLooped::DecodableAmNnetSimpleLooped( BaseFloat DecodableAmNnetSimpleLooped::LogLikelihood(int32 frame, int32 transition_id) { - int32 pdf_id = trans_model_.TransitionIdToPdf(transition_id); + int32 pdf_id = trans_model_.TransitionIdToPdfFast(transition_id); return decodable_nnet_.GetOutput(frame, pdf_id); } diff --git a/src/nnet3/nnet-am-decodable-simple.cc b/src/nnet3/nnet-am-decodable-simple.cc index d66e24830c6..9682bd96bc7 100644 --- 
a/src/nnet3/nnet-am-decodable-simple.cc +++ b/src/nnet3/nnet-am-decodable-simple.cc @@ -77,7 +77,7 @@ DecodableAmNnetSimple::DecodableAmNnetSimple( BaseFloat DecodableAmNnetSimple::LogLikelihood(int32 frame, int32 transition_id) { - int32 pdf_id = trans_model_.TransitionIdToPdf(transition_id); + int32 pdf_id = trans_model_.TransitionIdToPdfFast(transition_id); return decodable_nnet_.GetOutput(frame, pdf_id); } @@ -204,7 +204,7 @@ void DecodableNnetSimple::GetCurrentIvector(int32 output_t_start, << ", only available till frame " << online_ivector_feats_->NumRows() << " * ivector-period=" << online_ivector_period_ - << " (mismatched --ivector-period?)"; + << " (mismatched --online-ivector-period?)"; } ivector_frame = online_ivector_feats_->NumRows() - 1; } @@ -357,7 +357,7 @@ void DecodableAmNnetSimpleParallel::DeletePointers() { BaseFloat DecodableAmNnetSimpleParallel::LogLikelihood(int32 frame, int32 transition_id) { - int32 pdf_id = trans_model_.TransitionIdToPdf(transition_id); + int32 pdf_id = trans_model_.TransitionIdToPdfFast(transition_id); return decodable_nnet_->GetOutput(frame, pdf_id); } diff --git a/src/nnet3/nnet-batch-compute.cc b/src/nnet3/nnet-batch-compute.cc new file mode 100644 index 00000000000..6db046796be --- /dev/null +++ b/src/nnet3/nnet-batch-compute.cc @@ -0,0 +1,1313 @@ +// nnet3/nnet-batch-compute.cc + +// Copyright 2012-2018 Johns Hopkins University (author: Daniel Povey) +// 2018 Hang Lyu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include "nnet3/nnet-batch-compute.h" +#include "nnet3/nnet-utils.h" +#include "decoder/decodable-matrix.h" + +namespace kaldi { +namespace nnet3 { + + +NnetBatchComputer::NnetBatchComputer( + const NnetBatchComputerOptions &opts, + const Nnet &nnet, + const VectorBase &priors): + opts_(opts), + nnet_(nnet), + compiler_(nnet_, opts.optimize_config), + log_priors_(priors), + num_full_minibatches_(0) { + log_priors_.ApplyLog(); + CheckAndFixConfigs(); + ComputeSimpleNnetContext(nnet, &nnet_left_context_, + &nnet_right_context_); + input_dim_ = nnet.InputDim("input"); + ivector_dim_ = std::max(0, nnet.InputDim("ivector")); + output_dim_ = nnet.OutputDim("output"); + KALDI_ASSERT(input_dim_ > 0 && output_dim_ > 0); +} + +void NnetBatchComputer::PrintMinibatchStats() { + int32 max_stats_to_print = 10; + int64 tot_tasks = 0, tot_minibatches = 0; + double tot_time = 0.0; + std::ostringstream os; + struct MinibatchStats { + int32 num_frames_out; + int32 num_frames_in; + int32 minibatch_size; + int32 num_done; + int32 percent_full; + BaseFloat seconds_taken; + + bool operator < (const MinibatchStats &other) const { + return seconds_taken > other.seconds_taken; // sort from most to least time. 
+  std::vector<MinibatchStats> all_stats;
+ if (!mutex_.try_lock()) + KALDI_ERR << "Destructor called while object locked."; + int32 num_pending_tasks = 0; + for (auto iter = tasks_.begin(); iter != tasks_.end(); ++iter) + num_pending_tasks += iter->second.tasks.size(); + if (num_pending_tasks > 0) + KALDI_ERR << "Tasks are pending but object is being destroyed"; + for (auto iter = no_more_than_n_minibatches_full_.begin(); + iter != no_more_than_n_minibatches_full_.end(); ++iter) { + std::condition_variable *cond = iter->second; + // the next call will notify any threads that were waiting on this condition + // variable-- there shouldn't be any, though, as it would be a programming + // error, but better to wake them up so we can see any messages they print. + cond->notify_all(); + delete cond; + } + KALDI_ASSERT(num_full_minibatches_ == 0); // failure would be a coding error. +} + +NnetBatchComputer::MinibatchSizeInfo* +NnetBatchComputer::GetHighestPriorityComputation( + bool allow_partial_minibatch, + int32 *minibatch_size_out, + std::vector *tasks) { + tasks->clear(); + std::unique_lock(mutex_); + MapType::iterator iter = tasks_.begin(), end = tasks_.end(), + best_iter = tasks_.end(); + double highest_priority = -std::numeric_limits::infinity(); + + for (; iter != end; ++iter) { + ComputationGroupInfo &info = iter->second; + double this_priority = GetPriority(allow_partial_minibatch, info); + if (this_priority > highest_priority) { + highest_priority = this_priority; + best_iter = iter; + } + } + if (best_iter == tasks_.end()) { + // either allow_partial_minibatch == false and there were no full + // minibatches, or there were no pending tasks at all. 
+ return NULL; + } + ComputationGroupInfo &info = best_iter->second; + int32 actual_minibatch_size = GetActualMinibatchSize(info); + *minibatch_size_out = actual_minibatch_size; + MinibatchSizeInfo *minfo = &(info.minibatch_info[actual_minibatch_size]); + if (minfo->computation == NULL) + minfo->computation = GetComputation(info, actual_minibatch_size); + GetHighestPriorityTasks(actual_minibatch_size, &info, tasks); + return minfo; +} + + +void NnetBatchComputer::GetHighestPriorityTasks( + int32 num_tasks_needed, + ComputationGroupInfo *info, + std::vector *tasks) { + int32 num_tasks_present = info->tasks.size(), + minibatch_size = GetMinibatchSize(*info); + KALDI_ASSERT(tasks->empty()); + if (num_tasks_needed >= num_tasks_present) { + tasks->swap(info->tasks); + } else { + int32 num_tasks_not_needed = num_tasks_present - num_tasks_needed; + // We don't sort the tasks with a comparator that dereferences the pointers, + // because the priorities can change asynchronously, and we're concerned that + // something weird might happen in the sorting if the things it's comparing + // are changing. + std::vector > pairs(num_tasks_present); + for (int32 i = 0; i < num_tasks_present; i++) { + pairs[i].first = info->tasks[i]->priority; + pairs[i].second = info->tasks[i]; + } + std::nth_element(pairs.begin(), pairs.begin() + num_tasks_not_needed, + pairs.end()); + + // The lowest-priority 'num_tasks_not_needed' stay in the 'info' struct. + info->tasks.clear(); + for (int32 i = 0; i < num_tasks_not_needed; i++) + info->tasks.push_back(pairs[i].second); + // The highest-priority 'num_tasks_needed' tasks go to the output 'tasks' + // array. + for (int32 i = num_tasks_not_needed; i < num_tasks_present; i++) + tasks->push_back(pairs[i].second); + // The following assertion checks that the is_edge and is_irregular values + // are the same for the entire minibatch, which they should always be. 
+ KALDI_ASSERT(GetMinibatchSize(*info) == minibatch_size); + } + + { + // This block updates num_full_minibatches_ and notifies threads waiting on + // any related condition variable. + int32 new_num_tasks_present = info->tasks.size(), + full_minibatch_reduction = + (num_tasks_present / minibatch_size) - + (new_num_tasks_present / minibatch_size); + for (int32 i = 0; i < full_minibatch_reduction; i++) { + num_full_minibatches_--; + KALDI_ASSERT(num_full_minibatches_ >= 0); + std::unordered_map::const_iterator + iter = no_more_than_n_minibatches_full_.find(num_full_minibatches_); + if (iter != no_more_than_n_minibatches_full_.end()) { + std::condition_variable *cond = iter->second; + cond->notify_all(); + } + } + } +} + + +int32 NnetBatchComputer::GetMinibatchSize( + const ComputationGroupInfo &info) const { + if (info.tasks.empty()) { + return opts_.minibatch_size; // actually it shouldn't matter what we return + // in this case. + } + const NnetInferenceTask &task = *(info.tasks[0]); + if (task.is_irregular) + return 1; + else if (task.is_edge) + return opts_.edge_minibatch_size; + else + return opts_.minibatch_size; +} + +int32 NnetBatchComputer::GetActualMinibatchSize( + const ComputationGroupInfo &info) const { + KALDI_ASSERT(!info.tasks.empty()); + int32 num_tasks = info.tasks.size(), + this_minibatch_size = GetMinibatchSize(info); + KALDI_ASSERT(num_tasks > 0); + while (num_tasks < + int32(opts_.partial_minibatch_factor * this_minibatch_size)) + this_minibatch_size *= opts_.partial_minibatch_factor; + return int32(this_minibatch_size); +} + + +std::shared_ptr NnetBatchComputer::GetComputation( + const ComputationGroupInfo &info, + int32 minibatch_size) { + KALDI_ASSERT(!info.tasks.empty()); + // note: all the tasks will have the same structure, in the respects that + // would affect the computation. 
+ NnetInferenceTask *example_task = info.tasks[0]; + ComputationRequest request; + GetComputationRequest(*example_task, minibatch_size, &request); + return compiler_.Compile(request); +} + + +double NnetBatchComputer::GetPriority(bool allow_partial_minibatch, + const ComputationGroupInfo &info) const { + if (info.tasks.empty()) + return -std::numeric_limits::infinity(); + int32 this_minibatch_size = GetMinibatchSize(info); + int32 num_tasks = info.tasks.size(); + + if (!allow_partial_minibatch && num_tasks < this_minibatch_size) + return -std::numeric_limits::infinity(); + + // penalty_for_not_full will be negative if the minibatch is not full, up to a + // maximum of 10. the 10 is a heuristic; it could be changed. + // Note: the penalty is effectively infinity if allow_partial_minibatch == false; + // see the 'return' above. + double proportion_full = std::min(num_tasks, this_minibatch_size) / + double(this_minibatch_size), + penalty_for_not_full = 10.0 * (proportion_full - 1.0), + task_priority_sum = 0.0; + + + if (num_tasks > this_minibatch_size) { + // Get the average of the priorities of the highest-priority tasks (no more + // than 'minibatch_size' of them. + std::vector priorities; + priorities.resize(num_tasks); + for (int32 i = 0; i < num_tasks; i++) + priorities[i] = info.tasks[i]->priority; + // sort from greatest to least. 
+ std::nth_element(priorities.begin(), + priorities.begin() + this_minibatch_size, + priorities.end(), + std::greater()); + for (int32 i = 0; i < this_minibatch_size; i++) + task_priority_sum += priorities[i]; + return penalty_for_not_full + task_priority_sum / this_minibatch_size; + } else { + for (int32 i = 0; i < num_tasks; i++) + task_priority_sum += info.tasks[i]->priority; + return penalty_for_not_full + task_priority_sum / num_tasks; + } +} + + +// static +void NnetBatchComputer::GetComputationRequest( + const NnetInferenceTask &task, + int32 minibatch_size, + ComputationRequest *request) { + request->need_model_derivative = false; + request->store_component_stats = false; + request->inputs.reserve(2); + + int32 num_input_frames = task.input.NumRows(), + first_input_t = task.first_input_t, + num_output_frames = task.num_output_frames, + output_t_stride = task.output_t_stride; + bool has_ivector = (task.ivector.Dim() != 0); + + std::vector input_indexes, ivector_indexes, output_indexes; + input_indexes.reserve(minibatch_size * num_input_frames); + output_indexes.reserve(minibatch_size * num_output_frames); + if (has_ivector) + ivector_indexes.reserve(minibatch_size); + + for (int32 n = 0; n < minibatch_size; n++) { + for (int32 t = first_input_t; t < first_input_t + num_input_frames; t++) + input_indexes.push_back(Index(n, t, 0)); + if (has_ivector) + ivector_indexes.push_back(Index(n, 0, 0)); + for (int32 t = 0; t < num_output_frames; t++) + output_indexes.push_back(Index(n, t * output_t_stride, 0)); + } + request->inputs.push_back(IoSpecification("input", input_indexes)); + if (has_ivector) + request->inputs.push_back(IoSpecification("ivector", ivector_indexes)); + request->outputs.push_back(IoSpecification("output", output_indexes)); +} + + + +void NnetBatchComputer::CheckAndFixConfigs() { + static bool warned_frames_per_chunk = false; + int32 nnet_modulus = nnet_.Modulus(); + if (opts_.frame_subsampling_factor < 1 || + opts_.frames_per_chunk < 1) { + 
KALDI_ERR << "--frame-subsampling-factor and " + << "--frames-per-chunk must be > 0"; + } + KALDI_ASSERT(nnet_modulus > 0); + int32 n = Lcm(opts_.frame_subsampling_factor, nnet_modulus); + + if (opts_.frames_per_chunk % n != 0) { + // round up to the nearest multiple of n. + int32 frames_per_chunk = n * ((opts_.frames_per_chunk + n - 1) / n); + if (!warned_frames_per_chunk) { + warned_frames_per_chunk = true; + if (nnet_modulus == 1) { + // simpler error message. + KALDI_LOG << "Increasing --frames-per-chunk from " + << opts_.frames_per_chunk << " to " + << frames_per_chunk << " to make it a multiple of " + << "--frame-subsampling-factor=" + << opts_.frame_subsampling_factor; + } else { + KALDI_LOG << "Increasing --frames-per-chunk from " + << opts_.frames_per_chunk << " to " + << frames_per_chunk << " due to " + << "--frame-subsampling-factor=" + << opts_.frame_subsampling_factor << " and " + << "nnet shift-invariance modulus = " << nnet_modulus; + } + } + opts_.frames_per_chunk = frames_per_chunk; + } + KALDI_ASSERT(opts_.minibatch_size >= 1 && + opts_.edge_minibatch_size >= 1 && + opts_.partial_minibatch_factor < 1.0 && + opts_.partial_minibatch_factor >= 0.0); +} + + +void NnetBatchComputer::FormatInputs( + int32 minibatch_size, + const std::vector &tasks, + CuMatrix *input, + CuMatrix *ivector) { + int32 num_input_frames = tasks[0]->input.NumRows(), + input_dim = tasks[0]->input.NumCols(), + ivector_dim = tasks[0]->ivector.Dim(), + num_tasks = tasks.size(); + KALDI_ASSERT(num_tasks > 0 && num_tasks <= minibatch_size); + + // We first aggregate the input frames and i-vectors in matrices on the CPU, + // and then transfer them to the GPU. Later on we'll change this code to + // used pinned memory. 
+  Matrix<BaseFloat> input_cpu(num_tasks * num_input_frames, input_dim,
+ + // Also, we should probably used pinned memory. + + // We don't bother zeroing frames of the output that are unused, but you could + // un-comment the commented lines of code below to do so. + for (int32 n = 0; n < num_tasks; n++) { + NnetInferenceTask *task = tasks[n]; + + int32 left_unused = task->num_initial_unused_output_frames, + used = task->num_used_output_frames; + // int32 right_unused = num_output_frames - used - left_unused; + + if (task->output_to_cpu) { + task->output_cpu.Resize(num_output_frames, output_dim, + kUndefined); + // if (left_unused > 0) + // task->output_cpu.RowRange(0, left_unused).SetZero(); + task->output_cpu.RowRange(left_unused, used).CopyFromMat( + output.RowRange(n * num_output_frames + left_unused, used)); + // if (right_unused > 0) + // task->output_cpu.RowRange(0, left_unused + used, right_unused).SetZero(); + } else { + did_output_to_gpu = true; + task->output.Resize(num_output_frames, output_dim, + kUndefined); + // if (left_unused > 0) + // task->output.RowRange(0, left_unused).SetZero(); + task->output.RowRange(left_unused, used).CopyFromMat( + output.RowRange(n * num_output_frames + left_unused, used)); + // if (right_unused > 0) + // task->output.RowRange(0, left_unused + used, right_unused).SetZero(); + } + } + // The output of this function will likely be consumed by another thread. + // The following call will make sure the relevant kernels complete before + // any kernels from the other thread use the output. 
+ if (did_output_to_gpu) + SynchronizeGpu(); +} + +void NnetBatchComputer::AcceptTask(NnetInferenceTask *task, + int32 max_minibatches_full) { + std::unique_lock lock(mutex_); + + if (max_minibatches_full > 0 && num_full_minibatches_ > max_minibatches_full) { + std::unordered_map::iterator + iter = no_more_than_n_minibatches_full_.find(max_minibatches_full); + std::condition_variable *cond; + if (iter != no_more_than_n_minibatches_full_.end()) { + cond = iter->second; + } else { + cond = new std::condition_variable(); + no_more_than_n_minibatches_full_[max_minibatches_full] = cond; + } + while (num_full_minibatches_ > max_minibatches_full) + cond->wait(lock); + } + ComputationGroupKey key(*task); + ComputationGroupInfo &info = tasks_[key]; + info.tasks.push_back(task); + int32 minibatch_size = GetMinibatchSize(info); + if (static_cast(info.tasks.size()) % minibatch_size == 0) + num_full_minibatches_++; +} + +bool NnetBatchComputer::Compute(bool allow_partial_minibatch) { + int32 minibatch_size; + std::vector tasks; + MinibatchSizeInfo *minfo = + GetHighestPriorityComputation(allow_partial_minibatch, + &minibatch_size, + &tasks); + if (minfo == NULL) + return false; + + Timer tim; + Nnet *nnet_to_update = NULL; // we're not doing any update + NnetComputer computer(opts_.compute_config, *(minfo->computation), + nnet_, nnet_to_update); + + + CuMatrix input; + CuMatrix ivector; + FormatInputs(minibatch_size, tasks, &input, &ivector); + computer.AcceptInput("input", &input); + if (ivector.NumRows() != 0) + computer.AcceptInput("ivector", &ivector); + computer.Run(); + CuMatrix output; + computer.GetOutputDestructive("output", &output); + if (log_priors_.Dim() != 0) { + output.AddVecToRows(-1.0, log_priors_); + } + output.Scale(opts_.acoustic_scale); + FormatOutputs(output, tasks); + + // Update the stats, for diagnostics. 
+ minfo->num_done++; + minfo->tot_num_tasks += static_cast(tasks.size()); + minfo->seconds_taken += tim.Elapsed(); + + + SynchronizeGpu(); + + for (size_t i = 0; i < tasks.size(); i++) + tasks[i]->semaphore.Signal(); + + return true; +} + + +/** + This namespace contains things needed for the implementation of + the function NnetBatchComputer::SplitUtteranceIntoTasks(). + */ +namespace utterance_splitting { +/** + This function figures out how many chunks are needed for this utterance, + sets 'tasks' to a vector with that many elements, and sets up the + following elements in 'tasks': + output_t_stride + num_output_frames + num_initial_unused_output_frames + num_used_output_frames + @param [in] opts Options class + @param [in] num_subsampled_frames The number of output frames in this + utterance. Must be > 0. + @param [in] num_subsampled_frames_per_chunk The number of output frames + per chunk + @param [out] The 'tasks' array is output to here; it will have one + task per chunk, with only the members 'output_t_stride', + 'num_output_frames', 'num_initial_unused_output_frames', + 'num_used_output_frames' and 'is_irregular' set up. +*/ +void GetOutputFrameInfoForTasks( + const NnetBatchComputerOptions &opts, + int32 num_subsampled_frames, + int32 num_subsampled_frames_per_chunk, + std::vector *tasks) { + KALDI_ASSERT(num_subsampled_frames > 0); + int32 fpc = num_subsampled_frames_per_chunk; + int32 num_tasks = (num_subsampled_frames + fpc - 1) / fpc; + tasks->resize(num_tasks); + for (int32 i = 0; i < num_tasks; i++) { + (*tasks)[i].output_t_stride = opts.frame_subsampling_factor; + } + if (num_subsampled_frames <= fpc) { // there is one chunk. + KALDI_ASSERT(num_tasks == 1); // TODO: remove this. 
+ NnetInferenceTask &task = (*tasks)[0]; + task.first_used_output_frame_index = 0; + if (opts.ensure_exact_final_context) { + task.num_output_frames = num_subsampled_frames; + task.num_initial_unused_output_frames = 0; + task.num_used_output_frames = num_subsampled_frames; + task.is_irregular = true; + } else { + task.num_output_frames = fpc; + task.num_initial_unused_output_frames = 0; + task.num_used_output_frames = num_subsampled_frames; + task.is_irregular = false; + } + } else { + for (int32 i = 0; i + 1 < num_tasks; i++) { + NnetInferenceTask &task = (*tasks)[i]; + task.num_output_frames = fpc; + task.num_initial_unused_output_frames = 0; + task.num_used_output_frames = fpc; + task.first_used_output_frame_index = i * fpc; + task.is_irregular = false; + } + // The last chunk will end on the last frame of the file, but we won't use + // the part of its output that overlaps with the preceding chunk. + NnetInferenceTask &task = (*tasks)[num_tasks - 1]; + task.num_output_frames = fpc; + task.num_initial_unused_output_frames = ((num_tasks - 1) * fpc) - + (num_subsampled_frames - fpc); + task.num_used_output_frames = + num_subsampled_frames - ((num_tasks - 1) * fpc); + task.first_used_output_frame_index = (num_tasks - 1) * fpc; + task.is_irregular = false; + } + + if (true) { + // Do some checking. TODO: remove this. 
+ KALDI_ASSERT((*tasks)[0].first_used_output_frame_index == 0); + for (int32 i = 1; i < num_tasks; i++) { + KALDI_ASSERT((*tasks)[i].first_used_output_frame_index == + (*tasks)[i-1].first_used_output_frame_index + + (*tasks)[i-1].num_used_output_frames); + } + KALDI_ASSERT((*tasks)[num_tasks-1].first_used_output_frame_index + + (*tasks)[num_tasks-1].num_used_output_frames == + num_subsampled_frames); + for (int32 i = 0; i < num_tasks; i++) { + const NnetInferenceTask &task = (*tasks)[i]; + KALDI_ASSERT(task.num_used_output_frames + + task.num_initial_unused_output_frames <= + task.num_output_frames); + } + } +} + +void AddOnlineIvectorsToTasks( + const NnetBatchComputerOptions &opts, + const Matrix &online_ivectors, + int32 online_ivector_period, + std::vector *tasks) { + int32 f = opts.frame_subsampling_factor, + num_tasks = tasks->size(); + for (int32 i = 0; i < num_tasks; i++) { + NnetInferenceTask &task = (*tasks)[i]; + // begin_output_t and end_output_t are the subsampled frame indexes at + // the output; you'd have to multiply them by f to get real frame indexes. + int32 begin_output_t = task.first_used_output_frame_index - + task.num_initial_unused_output_frames, + mid_output_t = begin_output_t + (task.num_output_frames / 2), + mid_input_t = mid_output_t * f, + ivector_frame = mid_input_t / online_ivector_period, + num_ivector_frames = online_ivectors.NumRows(), + margin_in_frames = 20, + margin_in_ivector_frames = + (margin_in_frames + online_ivector_period - 1) / online_ivector_period; + // the 'margin' is our tolerance for when the number of rows of + // 'online_ivectors' is less than what we expected; we allow 20 frames of + // tolerance in the numbering of the original (input) features. + if (ivector_frame >= num_ivector_frames) { + if (num_ivector_frames > 0 && ivector_frame > num_ivector_frames - + margin_in_ivector_frames) { + ivector_frame = num_ivector_frames - 1; // Just take the last available one. 
+ } else { + KALDI_ERR << "Could not get iVector for frame " << ivector_frame + << ", online-ivectors matrix has " + << online_ivectors.NumRows() + << " rows. Mismatched --online-ivector-period?"; + } + } + task.ivector = online_ivectors.Row(ivector_frame); + } +} + + + +/** + This function sets up the 'input' and 'first_input_t' and 'is_edge' members + of the 'tasks' array; it is responsible for working out, for each task, + which input frames it needs (including left-context and right-context). + + The 'nnet_left_context' and 'nnet_right_context' are the inherent left + and right context of the network (num-frames required on left and right + to compute an output frame), and may be computed by doing: + ComputeSimpleNnetContext(nnet, &nnet_left_context_, &nnet_right_context_) +*/ +static void SplitInputToTasks(const NnetBatchComputerOptions &opts, + int32 nnet_left_context, + int32 nnet_right_context, + const Matrix &input, + std::vector *tasks) { + int32 num_input_frames = input.NumRows(), + f = opts.frame_subsampling_factor, + num_subsampled_frames = (num_input_frames + f - 1) / f, + extra_left_context_initial = (opts.extra_left_context_initial < 0 ? + opts.extra_left_context : + opts.extra_left_context_initial), + extra_right_context_final = (opts.extra_right_context_final < 0 ? + opts.extra_right_context : + opts.extra_right_context_final), + num_tasks = tasks->size(); + for (int32 i = 0; i < num_tasks; i++) { + NnetInferenceTask &task = (*tasks)[i]; + // begin_output_t and end_output_t are the subsampled frame indexes at + // the output; you'd have to multiply them by f to get real frame indexes. + int32 begin_output_t = task.first_used_output_frame_index - + task.num_initial_unused_output_frames, + end_output_t = begin_output_t + task.num_output_frames; + // begin_input_t and end_input_t are the real 't' values corresponding to + // begin_output_t and end_output_t; they are the beginning and end + // (i.e. 
first and last-plus-one) frame indexes without any left or right + // context. + int32 begin_input_t = begin_output_t * f, + end_input_t = end_output_t * f; + // Detect whether the left and right edges touch (or pass over) the left + // and right boundaries. Note: we don't expect begin_output_t to ever be + // negative. + bool left_edge = (begin_output_t <= 0), + right_edge = (end_output_t >= num_subsampled_frames); + int32 tot_left_context = nnet_left_context + + (left_edge ? extra_left_context_initial : opts.extra_left_context), + tot_right_context = nnet_right_context + + (right_edge ? extra_right_context_final : opts.extra_right_context); + + // 'is_edge' is only true if it's an edge minibatch *and* its being an + // edge actually made a difference to the structure of the example. + task.is_edge = + (tot_left_context != nnet_left_context + opts.extra_left_context || + tot_right_context != nnet_right_context + opts.extra_right_context); + + int32 begin_input_t_padded = begin_input_t - tot_left_context, + end_input_t_padded = end_input_t + tot_right_context; + + // 'task.first_input_t' is a representation of 'begin_input_t_padded' in a + // shifted/normalized numbering where the output time indexes start from + // zero. + task.first_input_t = begin_input_t_padded - (begin_output_t * f); + + task.input.Resize(end_input_t_padded - begin_input_t_padded, + input.NumCols(), kUndefined); + // the 't' value below is in the numbering of 'input'. 
+ for (int32 t = begin_input_t_padded; t < end_input_t_padded; t++) { + int32 t_clipped = t; + if (t_clipped < 0) t_clipped = 0; + if (t_clipped >= num_input_frames) t_clipped = num_input_frames - 1; + SubVector dest(task.input, + t - begin_input_t_padded), + src(input, t_clipped); + dest.CopyFromVec(src); + } + } +} + +} // namespace utterance_splitting + + +void NnetBatchComputer::SplitUtteranceIntoTasks( + bool output_to_cpu, + const Matrix &input, + const Vector *ivector, + const Matrix *online_ivectors, + int32 online_ivector_period, + std::vector *tasks) { + using namespace utterance_splitting; + + + { // This block does some checking. + if (input.NumCols() != input_dim_) { + KALDI_ERR << "Input features did not have expected dimension: expected " + << input_dim_ << ", got " << input.NumCols(); + } + int32 ivector_dim = (ivector != NULL ? ivector->Dim() : + (online_ivectors != NULL ? + online_ivectors->NumCols() : 0)); + if (ivector_dim_ != 0 && ivector_dim == 0) + KALDI_ERR << "Model expects i-vectors but none were supplied"; + else if (ivector_dim_ == 0 && ivector_dim != 0) + KALDI_ERR << "You supplied i-vectors but model does not expect them."; + else if (ivector_dim != ivector_dim_) + KALDI_ERR << "I-vector dimensions mismatch: model expects " + << ivector_dim_ << ", you supplied " << ivector_dim; + } + + + int32 num_input_frames = input.NumRows(), + f = opts_.frame_subsampling_factor, + num_subsampled_frames = (num_input_frames + f - 1) / f, + num_subsampled_frames_per_chunk = opts_.frames_per_chunk / f; + + GetOutputFrameInfoForTasks(opts_, num_subsampled_frames, + num_subsampled_frames_per_chunk, + tasks); + + SplitInputToTasks(opts_, nnet_left_context_, nnet_right_context_, + input, tasks); + + if (ivector != NULL) { + KALDI_ASSERT(online_ivectors == NULL); + for (size_t i = 0; i < tasks->size(); i++) + (*tasks)[i].ivector = *ivector; + } else if (online_ivectors != NULL) { + AddOnlineIvectorsToTasks(opts_, *online_ivectors, + online_ivector_period, 
tasks); + } + + for (size_t i = 0; i < tasks->size(); i++) { + (*tasks)[i].output_to_cpu = output_to_cpu; + // The priority will be set by the user; this just avoids undefined + // behavior. + (*tasks)[i].priority = 0.0; + } +} + + +void MergeTaskOutput( + const std::vector &tasks, + Matrix *output) { + int32 num_tasks = tasks.size(), + num_output_frames = 0, + output_dim = -1; + for (int32 i = 0; i < num_tasks; i++) { + const NnetInferenceTask &task = tasks[i]; + num_output_frames += task.num_used_output_frames; + if (i == 0) { + output_dim = (task.output_to_cpu ? + task.output_cpu.NumCols() : + task.output.NumCols()); + } + } + KALDI_ASSERT(num_output_frames != 0 && output_dim != 0); + int32 cur_output_frame = 0; + output->Resize(num_output_frames, output_dim); + for (int32 i = 0; i < num_tasks; i++) { + const NnetInferenceTask &task = tasks[i]; + int32 skip = task.num_initial_unused_output_frames, + num_used = task.num_used_output_frames; + KALDI_ASSERT(cur_output_frame == task.first_used_output_frame_index); + if (task.output_to_cpu) { + output->RowRange(cur_output_frame, num_used).CopyFromMat( + task.output_cpu.RowRange(skip, num_used)); + } else { + output->RowRange(cur_output_frame, num_used).CopyFromMat( + task.output.RowRange(skip, num_used)); + } + cur_output_frame += num_used; + } + KALDI_ASSERT(cur_output_frame == num_output_frames); +} + + +NnetBatchInference::NnetBatchInference( + const NnetBatchComputerOptions &opts, + const Nnet &nnet, + const VectorBase &priors): + computer_(opts, nnet, priors), + is_finished_(false), + utterance_counter_(0) { + // 'thread_' will run the Compute() function in the background. 
+ compute_thread_ = std::thread(ComputeFunc, this); +} + + +void NnetBatchInference::AcceptInput( + const std::string &utterance_id, + const Matrix &input, + const Vector *ivector, + const Matrix *online_ivectors, + int32 online_ivector_period) { + + UtteranceInfo *info = new UtteranceInfo(); + info->utterance_id = utterance_id; + info->num_tasks_finished = 0; + bool output_to_cpu = true; // This wrapper is for when you need the nnet + // output on CPU, e.g. because you want it + // written to disk. If this needs to be + // configurable in the future, we can make changes + // then. + computer_.SplitUtteranceIntoTasks( + output_to_cpu, input, ivector, online_ivectors, + online_ivector_period, &(info->tasks)); + + // Setting this to a nonzero value will cause the AcceptTask() call below to + // hang until the computation thread has made some progress, if too much + // data is already queued. + int32 max_full_minibatches = 2; + + // Earlier utterances have higher priority, which is important to make sure + // that their corresponding tasks are completed and they can be output to disk. + double priority = -1.0 * (utterance_counter_++); + for (size_t i = 0; i < info->tasks.size(); i++) { + info->tasks[i].priority = priority; + computer_.AcceptTask(&(info->tasks[i]), max_full_minibatches); + } + utts_.push_back(info); + tasks_ready_semaphore_.Signal(); +} + +bool NnetBatchInference::GetOutput(std::string *utterance_id, + Matrix *output) { + if (utts_.empty()) + return false; + + UtteranceInfo *info = *utts_.begin(); + std::vector &tasks = info->tasks; + int32 num_tasks = tasks.size(); + for (; info->num_tasks_finished < num_tasks; ++info->num_tasks_finished) { + Semaphore &semaphore = tasks[info->num_tasks_finished].semaphore; + if (is_finished_) { + semaphore.Wait(); + } else { + if (!semaphore.TryWait()) { + // If not all of the tasks of this utterance are ready yet, + // just return false. 
+ return false; + } + } + } + MergeTaskOutput(tasks, output); + *utterance_id = info->utterance_id; + delete info; + utts_.pop_front(); + return true; +} + +NnetBatchInference::~NnetBatchInference() { + if (!is_finished_) + KALDI_ERR << "Object destroyed before Finished() was called."; + if (!utts_.empty()) + KALDI_ERR << "You should get all output before destroying this object."; + compute_thread_.join(); +} + +void NnetBatchInference::Finished() { + is_finished_ = true; + tasks_ready_semaphore_.Signal(); +} + +// This is run as the thread of class NnetBatchInference. +void NnetBatchInference::Compute() { + bool allow_partial_minibatch = false; + while (true) { + // keep calling Compute() as long as it makes progress. + while (computer_.Compute(allow_partial_minibatch)); + + // ... then wait on tasks_ready_semaphore_. + tasks_ready_semaphore_.Wait(); + if (is_finished_) { + allow_partial_minibatch = true; + while (computer_.Compute(allow_partial_minibatch)); + return; + } + } +} + + +NnetBatchDecoder::NnetBatchDecoder( + const fst::Fst &fst, + const LatticeFasterDecoderConfig &decoder_opts, + const TransitionModel &trans_model, + const fst::SymbolTable *word_syms, + bool allow_partial, + int32 num_threads, + NnetBatchComputer *computer): + fst_(fst), decoder_opts_(decoder_opts), + trans_model_(trans_model), word_syms_(word_syms), + allow_partial_(allow_partial), computer_(computer), + is_finished_(false), tasks_finished_(false), priority_offset_(0.0), + tot_like_(0.0), frame_count_(0), num_success_(0), num_fail_(0), + num_partial_(0) { + KALDI_ASSERT(num_threads > 0); + for (int32 i = 0; i < num_threads; i++) + decode_threads_.push_back(new std::thread(DecodeFunc, this)); + compute_thread_ = std::thread(ComputeFunc, this); +} + +void NnetBatchDecoder::SetPriorities(std::vector *tasks) { + size_t num_tasks = tasks->size(); + double priority_offset = priority_offset_; + for (size_t i = 0; i < num_tasks; i++) + (*tasks)[i].priority = priority_offset - (double)i; +} + 
+void NnetBatchDecoder::UpdatePriorityOffset(double priority) { + size_t num_tasks = decode_threads_.size(), + new_weight = 1.0 / num_tasks, + old_weight = 1.0 - new_weight; + // The next line is vulnerable to a race condition but if it happened it + // wouldn't matter. + priority_offset_ = priority_offset_ * old_weight + priority * new_weight; +} + +void NnetBatchDecoder::AcceptInput( + const std::string &utterance_id, + const Matrix &input, + const Vector *ivector, + const Matrix *online_ivectors, + int32 online_ivector_period){ + // This function basically does a handshake with one of the decoder threads. + // It may have to wait till one of the decoder threads becomes ready. + input_utterance_.utterance_id = utterance_id; + input_utterance_.input = &input; + input_utterance_.ivector = ivector; + input_utterance_.online_ivectors = online_ivectors; + input_utterance_.online_ivector_period = online_ivector_period; + + + UtteranceOutput *this_output = new UtteranceOutput(); + this_output->utterance_id = utterance_id; + pending_utts_.push_back(this_output); + + input_ready_semaphore_.Signal(); + input_consumed_semaphore_.Wait(); +} + +int32 NnetBatchDecoder::Finished() { + is_finished_ = true; + for (size_t i = 0; i < decode_threads_.size(); i++) + input_ready_semaphore_.Signal(); + for (size_t i = 0; i < decode_threads_.size(); i++) { + decode_threads_[i]->join(); + delete decode_threads_[i]; + decode_threads_[i] = NULL; + } + // don't clear decode_threads_, since its size is needed in the destructor to + // compute timing. 
+ + tasks_finished_ = true; + tasks_ready_semaphore_.Signal(); + compute_thread_.join(); + return num_success_; +} + + +bool NnetBatchDecoder::GetOutput( + std::string *utterance_id, + CompactLattice *clat, + std::string *sentence) { + if (!decoder_opts_.determinize_lattice) + KALDI_ERR << "Don't call this version of GetOutput if you are " + "not determinizing."; + while (true) { + if (pending_utts_.empty()) + return false; + if (!pending_utts_.front()->finished) + return false; + UtteranceOutput *this_output = pending_utts_.front(); + pending_utts_.pop_front(); + if (this_output->compact_lat.NumStates() == 0) { + delete this_output; + // ... and continue round the loop, without returning any output to the + // user for this utterance. Something went wrong in decoding: for + // example, the user specified allow_partial == false and no final-states + // were active on the last frame, or something more unexpected. A warning + // would have been printed in the decoder thread. + } else { + *clat = this_output->compact_lat; + utterance_id->swap(this_output->utterance_id); + sentence->swap(this_output->sentence); + delete this_output; + return true; + } + } +} + + +bool NnetBatchDecoder::GetOutput( + std::string *utterance_id, + Lattice *lat, + std::string *sentence) { + if (decoder_opts_.determinize_lattice) + KALDI_ERR << "Don't call this version of GetOutput if you are " + "determinizing."; + while (true) { + if (pending_utts_.empty()) + return false; + if (!pending_utts_.front()->finished) + return false; + UtteranceOutput *this_output = pending_utts_.front(); + pending_utts_.pop_front(); + if (this_output->compact_lat.NumStates() == 0) { + delete this_output; + // ... and continue round the loop, without returning any output to the + // user for this utterance. Something went wrong in decoding: for + // example, the user specified allow_partial == false and no final-states + // were active on the last frame, or something more unexpected. 
A warning + // would have been printed in the decoder thread. + } else { + *lat = this_output->lat; // OpenFST has shallow copy so no need to swap. + utterance_id->swap(this_output->utterance_id); + sentence->swap(this_output->sentence); + delete this_output; + return true; + } + } +} + +void NnetBatchDecoder::Compute() { + while (!tasks_finished_) { + tasks_ready_semaphore_.Wait(); + bool allow_partial_minibatch = true; + while (computer_->Compute(allow_partial_minibatch)); + } +} + +void NnetBatchDecoder::Decode() { + while (true) { + input_ready_semaphore_.Wait(); + if (is_finished_) + return; + + std::vector tasks; + std::string utterance_id; + // we can be confident that the last element of 'pending_utts_' is the one + // for this utterance, as we know exactly at what point in the code the main + // thread will be in AcceptInput(). + UtteranceOutput *output_utterance = pending_utts_.back(); + { + UtteranceInput input_utterance(input_utterance_); + utterance_id = input_utterance.utterance_id; + bool output_to_cpu = true; + computer_->SplitUtteranceIntoTasks(output_to_cpu, + *(input_utterance.input), + input_utterance.ivector, + input_utterance.online_ivectors, + input_utterance.online_ivector_period, + &tasks); + KALDI_ASSERT(output_utterance->utterance_id == utterance_id); + input_consumed_semaphore_.Signal(); + // Now let input_utterance go out of scope; it's no longer valid as it may + // be overwritten by something else. 
+ } + + SetPriorities(&tasks); + for (size_t i = 0; i < tasks.size(); i++) + computer_->AcceptTask(&(tasks[i])); + tasks_ready_semaphore_.Signal(); + + { + int32 frame_offset = 0; + LatticeFasterDecoder decoder(fst_, decoder_opts_); + decoder.InitDecoding(); + + + for (size_t i = 0; i < tasks.size(); i++) { + NnetInferenceTask &task = tasks[i]; + task.semaphore.Wait(); + UpdatePriorityOffset(task.priority); + + SubMatrix post(task.output_cpu, + task.num_initial_unused_output_frames, + task.num_used_output_frames, + 0, task.output_cpu.NumCols()); + DecodableMatrixMapped decodable(trans_model_, post, frame_offset); + frame_offset += post.NumRows(); + decoder.AdvanceDecoding(&decodable); + task.output.Resize(0, 0); // Free some memory. + } + + bool use_final_probs = true; + if (!decoder.ReachedFinal()) { + if (allow_partial_) { + KALDI_WARN << "Outputting partial output for utterance " + << utterance_id << " since no final-state reached\n"; + use_final_probs = false; + std::unique_lock lock(stats_mutex_); + num_partial_++; + } else { + KALDI_WARN << "Not producing output for utterance " << utterance_id + << " since no final-state reached and " + << "--allow-partial=false.\n"; + std::unique_lock lock(stats_mutex_); + num_fail_++; + continue; + } + } + // if we reached this point, we are getting a lattice. + decoder.GetRawLattice(&output_utterance->lat, use_final_probs); + // Let the decoder and the decodable object go out of scope, to save + // memory. 
+ } + ProcessOutputUtterance(output_utterance); + } +} + + +void NnetBatchDecoder::UtteranceFailed() { + std::unique_lock lock(stats_mutex_); + num_fail_++; +} + +void NnetBatchDecoder::ProcessOutputUtterance(UtteranceOutput *output) { + fst::Connect(&(output->lat)); + if (output->lat.NumStates() == 0) { + KALDI_WARN << "Unexpected problem getting lattice for utterance " + << output->utterance_id; + std::unique_lock lock(stats_mutex_); + num_fail_++; + return; + } + + { // This block accumulates diagnostics, prints log messages, and sets + // output->sentence. + Lattice best_path; + LatticeWeight weight; + ShortestPath(output->lat, &best_path); + std::vector alignment; + std::vector words; + GetLinearSymbolSequence(best_path, &alignment, &words, &weight); + int32 num_frames = alignment.size(); + if (word_syms_ != NULL) { + std::ostringstream os; + for (size_t i = 0; i < words.size(); i++) { + std::string s = word_syms_->Find(words[i]); + if (s == "") + KALDI_ERR << "Word-id " << words[i] << " not in symbol table."; + os << s << ' '; + } + output->sentence = os.str(); + } + double likelihood = -(weight.Value1() + weight.Value2()); + // Note: these logging messages will be out-of-order w.r.t. the transcripts + // that are printed to cerr; we keep those transcripts in the same order + // that the utterances were in, but these logging messages may be out of + // order (due to multiple threads). 
+ KALDI_LOG << "Log-like per frame for utterance " << output->utterance_id + << " is " << (likelihood / num_frames) << " over " + << num_frames << " frames."; + KALDI_VLOG(2) << "Cost for utterance " << output->utterance_id << " is " + << weight.Value1() << " + " << weight.Value2(); + + std::unique_lock lock(stats_mutex_); + tot_like_ += likelihood; + frame_count_ += num_frames; + num_success_ += 1; + } + + if (decoder_opts_.determinize_lattice) { + if (!DeterminizeLatticePhonePrunedWrapper( + trans_model_, + &output->lat, + decoder_opts_.lattice_beam, + &(output->compact_lat), + decoder_opts_.det_opts)) + KALDI_WARN << "Determinization finished earlier than the beam for " + << "utterance " << output->utterance_id; + output->lat.DeleteStates(); // Save memory. + } + + // We'll write the lattice without acoustic scaling, so we need to reverse + // the scale that we applied when decoding. + BaseFloat acoustic_scale = computer_->GetOptions().acoustic_scale; + if (acoustic_scale != 0.0) { + if (decoder_opts_.determinize_lattice) + fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), + &(output->compact_lat)); + else + fst::ScaleLattice(fst::AcousticLatticeScale(1.0 / acoustic_scale), + &(output->lat)); + } + output->finished = true; +} + + + +NnetBatchDecoder::~NnetBatchDecoder() { + if (!is_finished_ || !pending_utts_.empty()) { + // At this point the application is bound to fail so raising another + // exception is not a big problem. + KALDI_ERR << "Destroying NnetBatchDecoder object without calling " + "Finished() and consuming the remaining output"; + } + // Print diagnostics. 
+ + kaldi::int64 input_frame_count = + frame_count_ * computer_->GetOptions().frame_subsampling_factor; + int32 num_threads = static_cast(decode_threads_.size()); + + KALDI_LOG << "Overall likelihood per frame was " + << tot_like_ / std::max(1, frame_count_) + << " over " << frame_count_ << " frames."; + + double elapsed = timer_.Elapsed(); + // the +1 below is just to avoid division-by-zero errors. + KALDI_LOG << "Time taken "<< elapsed + << "s: real-time factor assuming 100 frames/sec is " + << (num_threads * elapsed * 100.0 / + std::max(input_frame_count, 1)) + << " (per thread; with " << num_threads << " threads)."; + KALDI_LOG << "Done " << num_success_ << " utterances (" + << num_partial_ << " forced out); failed for " + << num_fail_; +} + + +} // namespace nnet3 +} // namespace kaldi diff --git a/src/nnet3/nnet-batch-compute.h b/src/nnet3/nnet-batch-compute.h new file mode 100644 index 00000000000..9861a28976c --- /dev/null +++ b/src/nnet3/nnet-batch-compute.h @@ -0,0 +1,836 @@ +// nnet3/nnet-batch-compute.h + +// Copyright 2012-2018 Johns Hopkins University (author: Daniel Povey) +// 2018 Hang Lyu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef KALDI_NNET3_NNET_BATCH_COMPUTE_H_ +#define KALDI_NNET3_NNET_BATCH_COMPUTE_H_ + +#include +#include +#include +#include +#include +#include "base/kaldi-common.h" +#include "gmm/am-diag-gmm.h" +#include "hmm/transition-model.h" +#include "itf/decodable-itf.h" +#include "nnet3/nnet-optimize.h" +#include "nnet3/nnet-compute.h" +#include "nnet3/am-nnet-simple.h" +#include "nnet3/nnet-am-decodable-simple.h" +#include "decoder/lattice-faster-decoder.h" +#include "util/stl-utils.h" + + +namespace kaldi { +namespace nnet3 { + + +/** + class NnetInferenceTask represents a chunk of an utterance that is + requested to be computed. This will be given to NnetBatchComputer, which + will aggregate the tasks and complete them. + */ +struct NnetInferenceTask { + // The copy constructor is required to exist because of std::vector's resize() + // function, but in practice should never be used. + NnetInferenceTask(const NnetInferenceTask &other) { + KALDI_ERR << "NnetInferenceTask was not designed to be copied."; + } + NnetInferenceTask() { } + + + // The input frames, which are treated as being numbered t=0, t=1, etc. (If + // the lowest t value was originally nonzero in the 'natural' numbering, this + // just means we conceptually shift the 't' values; the only real constraint + // is that the 't' values are contiguous. + Matrix input; + + // The index of the first output frame (in the shifted numbering where the + // first output frame is numbered zero. This will typically be less than one, + // because most network topologies require left context. If this was an + // 'interior' chunk of a recurrent topology like LSTMs, first_input_t may be + // substantially less than zero, due to 'extra_left_context'. + int32 first_input_t; + + // The stride of output 't' values: e.g., will be 1 for normal-frame-rate + // models, and 3 for low-frame-rate models such as chain models. 
+ int32 output_t_stride; + + // The number of output 't' values (they will start from zero and be separated + // by output_t_stride). This will be the num-rows of 'output'. + int32 num_output_frames; + + // 'num_initial_unused_output_frames', which will normally be zero, is the + // number of rows of the output matrix ('output' or 'output_cpu') which won't + // actually be needed by the user, usually because they overlap with a + // previous chunk. This can happen because the number of outputs isn't a + // multiple of the number of chunks. + int32 num_initial_unused_output_frames; + + // 0 < num_used_output_frames <= num_output_frames - num_initial_unused_output_frames + // is the number of output frames which are actually going to be used by the + // user. (Due to edge effects, not all are necessarily used). + int32 num_used_output_frames; + + // first_used_output_frame_index is provided for the convenience of the user + // so that they can know how this chunk relates to the utterance which it is + // a part of. + // It represents an output frame index in the original utterance-- after + // subsampling; so not a 't' value but a 't' value divided by + // frame-subsampling-factor. Specifically, it tells you the row index in the + // full utterance's output which corresponds to the first 'used' frame index + // at the output of this chunk, specifically: the row numbered + // 'num_initial_unused_output_frames' in the 'output' or 'output_cpu' data + // member. + int32 first_used_output_frame_index; + + // True if this chunk is an 'edge' (the beginning or end of an utterance) AND + // is structurally different somehow from non-edge chunk, e.g. requires less + // context. This is present only so that NnetBatchComputer will know the + // appropriate minibatch size to use. + bool is_edge; + + // True if this task represents an irregular-sized chunk. 
These can happen + // only for utterances that are shorter than the requested minibatch size, and + // it should be quite rare. We use a minibatch size of 1 in this case. + bool is_irregular; + + // The i-vector for this chunk, if this network accepts i-vector inputs. + Vector ivector; + + // A priority (higher is more urgent); may be either sign. May be updated + // after this object is provided to class NnetBatchComputer. + double priority; + + // This semaphore will be incremented by class NnetBatchComputer when this + // chunk is done. After this semaphore is incremented, class + // NnetBatchComputer will no longer hold any pointers to this class. + Semaphore semaphore; + + // Will be set to true by the caller if they want the output of the neural net + // to be copied to CPU (to 'output'). If false, the output will stay on + // the GPU (if used)- in cu_output. + bool output_to_cpu; + + // The neural net output, of dimension num_output_frames by the output-dim of + // the neural net, will be written to 'output_cpu' if 'output_to_cpu' is true. + // This is expected to be empty when this task is provided to class + // NnetBatchComputer, and will be nonempty (if output_to_cpu == true) when the + // task is completed and the semaphore is signaled. + Matrix output_cpu; + + // The output goes here instead of 'output_to_cpu' is false. 
+ CuMatrix output; +}; + + +struct NnetBatchComputerOptions: public NnetSimpleComputationOptions { + int32 minibatch_size; + int32 edge_minibatch_size; + bool ensure_exact_final_context; + BaseFloat partial_minibatch_factor; + + NnetBatchComputerOptions(): minibatch_size(128), + edge_minibatch_size(32), + ensure_exact_final_context(false), + partial_minibatch_factor(0.5) { + } + + void Register(OptionsItf *po) { + NnetSimpleComputationOptions::Register(po); + po->Register("minibatch-size", &minibatch_size, "Number of chunks per " + "minibatch (see also edge-minibatch-size)"); + po->Register("edge-minibatch-size", &edge_minibatch_size, "Number of " + "chunks per minibatch: this applies to chunks at the " + "beginnings and ends of utterances, in cases (such as " + "recurrent models) when the computation would be different " + "from the usual one."); + po->Register("ensure-exact-final-context", &ensure_exact_final_context, + "If true, for utterances shorter than --frames-per-chunk, " + "use exact-length, special computations. If false, " + "pad with repeats of the last frame. Would only affect " + "the output for backwards-recurrent models, but would " + "negatively impact speed in all cases."); + po->Register("partial-minibatch-factor", &partial_minibatch_factor, + "Factor that controls how small partial minibatches will be " + "they become necessary. We will potentially do the computation " + "for sizes: int(partial_minibatch_factor^n * minibatch_size " + ", for n = 0, 1, 2.... Set it to 0.0 if you want to use " + "only the specified minibatch sizes."); + } +}; + + +/** + Merges together the 'output_cpu' (if the 'output_to_cpu' members are true) or + the 'output' members of 'tasks' into a single CPU matrix 'output'. Requires that + those outputs are nonempty (i.e. that those tasks must have been completed). + + @param [in] tasks The vector of tasks whose outputs are to be merged. + The tasks must have already been completed. 
+ @param [output output The spliced-together output matrix + + TODO: in the future, maybe start from GPU and use pinned matrices for the + transfer. + */ +void MergeTaskOutput( + const std::vector &tasks, + Matrix *output); + +/** + This class does neural net inference in a way that is optimized for GPU use: + it combines chunks of multiple utterances into minibatches for more efficient + computation. It does the computation in one background thread that accesses + the GPU. It is thread safe, i.e. you can call it from multiple threads + without having to worry about data races and the like. +*/ +class NnetBatchComputer { + public: + /** Constructor. It stores references to all the arguments, so don't delete + them till this object goes out of scop. + + \param [in] opts Options struct + \param [in] nnet The neural net which we'll be doing the computation with + \param [in] priors Either the empty vector, or a vector of prior + probabilities which we'll take the log of and subtract + from the neural net outputs (e.g. used in non-chain + systems). + */ + NnetBatchComputer(const NnetBatchComputerOptions &opts, + const Nnet &nnet, + const VectorBase &priors); + + + /// Accepts a task, meaning the task will be queued. (Note: the pointer is + /// still owned by the caller. + /// If the max_minibatches_full >= 0, then the calling thread will block until + /// no more than that many full minibatches are waiting to be computed. This + /// is a mechanism to prevent too many requests from piling up in memory. + void AcceptTask(NnetInferenceTask *task, + int32 max_minibatches_full = -1); + + /// Returns the number of full minibatches waiting to be computed. + int32 NumFullPendingMinibatches() const { return num_full_minibatches_; } + + + /** + Does some kind of computation, choosing the highest-priority thing to + compute. It returns true if it did some kind of computation, and false + otherwise. 
This function locks the class, but not for the entire time + it's being called: only at the beginning and at the end. + @param [in] allow_partial_minibatch If false, then this will only + do the computation if a full minibatch is ready; if true, it + is allowed to do computation on partial (not-full) minibatches. + */ + bool Compute(bool allow_partial_minibatch); + + + /** + Split a single utterance into a list of separate tasks which can then + be given to this class by AcceptTask(). + + @param [in] output_to_cpu Will become the 'output_to_cpu' member of the + output tasks; this controls whether the computation code should transfer + the outputs to CPU (which is to save GPU memory). + @param [in] ivector If non-NULL, and i-vector for the whole utterance is + expected to be supplied here (and online_ivectors should be NULL). + This is relevant if you estimate i-vectors per speaker instead of + online. + @param [in] online_ivectors Matrix of ivectors, one every 'online_ivector_period' frames. + @param [in] online_ivector_period Affects the interpretation of 'online_ivectors'. + @param [out] tasks The tasks created will be output to here. The + priorities will be set to zero; setting them to a meaningful + value is up to the caller. + */ + void SplitUtteranceIntoTasks( + bool output_to_cpu, + const Matrix &input, + const Vector *ivector, + const Matrix *online_ivectors, + int32 online_ivector_period, + std::vector *tasks); + + const NnetBatchComputerOptions &GetOptions() { return opts_; } + + ~NnetBatchComputer(); + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(NnetBatchComputer); + + // Information about a specific minibatch size for a group of tasks sharing a + // specific structure (in terms of left and right context, etc.) + struct MinibatchSizeInfo { + // the computation for this minibatch size. + std::shared_ptr computation; + int32 num_done; // The number of minibatches computed: for diagnostics. 
+ int64 tot_num_tasks; // The total number of tasks in those minibatches, + // also for diagnostics... can be used to compute + // how 'full', on average, these minibatches were. + double seconds_taken; // The total time elapsed in computation for this + // minibatch type. + MinibatchSizeInfo(): computation(NULL), num_done(0), + tot_num_tasks(0), seconds_taken(0.0) { } + }; + + + // A computation group is a group of tasks that have the same structure + // (number of input and output frames, left and right context). + struct ComputationGroupInfo { + // The tasks to be completed. This array is added-to by AcceptTask(), + // and removed-from by GetHighestPriorityComputation(), which is called + // from Compute(). + std::vector tasks; + + // Map from minibatch-size to information specific to this minibatch-size, + // including the NnetComputation. This is set up by + // GetHighestPriorityComputation(), which is called from Compute(). + std::map minibatch_info; + }; + + // This struct allows us to arrange the tasks into groups that can be + // computed in the same minibatch. + struct ComputationGroupKey { + ComputationGroupKey(const NnetInferenceTask &task): + num_input_frames(task.input.NumRows()), + first_input_t(task.first_input_t), + num_output_frames(task.num_output_frames) {} + + bool operator == (const ComputationGroupKey &other) const { + return num_input_frames == other.num_input_frames && + first_input_t == other.first_input_t && + num_output_frames == other.num_output_frames; + } + int32 num_input_frames; + int32 first_input_t; + int32 num_output_frames; + }; + + struct ComputationGroupKeyHasher { + int32 operator () (const ComputationGroupKey &key) const { + return key.num_input_frames + 18043 * key.first_input_t + + 6413 * key.num_output_frames; + } + }; + + + typedef unordered_map MapType; + + // Gets the priority for a group, higher means higher priority. (A group is a + // list of tasks that may be computed in the same minibatch). 
What this + // function does is a kind of heuristic. + // If allow_partial_minibatch == false, it will set the priority for + // any minibatches that are not full to negative infinity. + inline double GetPriority(bool allow_partial_minibatch, + const ComputationGroupInfo &info) const; + + // Returns the minibatch size for this group of tasks, i.e. the size of a full + // minibatch for this type of task, which is what we'd ideally like to + // compute. Note: the is_edge and is_irregular options should be the same + // for for all tasks in the group. + // - If 'tasks' is empty or info.is_edge and info.is_irregular are both, + // false, then return opts_.minibatch_size + // - If 'tasks' is nonempty and tasks[0].is_irregular is true, then + // returns 1. + // - If 'tasks' is nonempty and tasks[0].is_irregular is false and + // tasks[0].is_edge is true, then returns opts_.edge_minibatch_size. + inline int32 GetMinibatchSize(const ComputationGroupInfo &info) const; + + + // This function compiles, and returns, a computation for tasks of + // the structure present in info.tasks[0], and the specified minibatch + // size. + std::shared_ptr GetComputation( + const ComputationGroupInfo &info, + int32 minibatch_size); + + + // Returns the actual minibatch size we'll use for this computation. In most + // cases it will be opts_.minibatch_size (or opts_.edge_minibatch_size if + // appropriate; but if the number of available tasks is much less than the + // appropriate minibatch size, it may be less. The minibatch size may be + // greater than info.tasks.size(); in that case, the remaining 'n' values in + // the minibatch are not used. (It may also be less than info.tasks.size(), + // in which case we only do some of them). 
+ int32 GetActualMinibatchSize(const ComputationGroupInfo &info) const; + + + // This function gets the highest-priority 'num_tasks' tasks from 'info', + // removes them from the array info->tasks, and puts them into the array + // 'tasks' (which is assumed to be initially empty). + // This function also updates the num_full_minibatches_ variable if + // necessary, and takes care of notifying any related condition variables. + void GetHighestPriorityTasks( + int32 num_tasks, + ComputationGroupInfo *info, + std::vector *tasks); + + /** + This function finds and returns the computation corresponding to the + highest-priority group of tasks. + + @param [in] allow_partial_minibatch If this is true, then this + function may return a computation corresponding to a partial + minibatch-- i.e. the minibatch size in the computation may be + less than the minibatch size in the options class, and/or + the number of tasks may not be as many as the minibatch size + in the computation. + @param [out] minibatch_size If this function returns non-NULL, then + this will be set to the minibatch size that the returned + computation expects. This may be less than tasks->size(), + in cases where the minibatch was not 'full'. + @param [out] tasks The tasks which we'll be doing the computation + for in this minibatch are put here (and removed from tasks_, + in cases where this function returns non-NULL. + @return This function returns a pointer to the appropriate + 'MinibatchSizeInfo' object corresponding to the computation + that we'll be doing for this minibatch, or NULL if there is nothing + to compute. + */ + MinibatchSizeInfo *GetHighestPriorityComputation( + bool allow_partial_minibatch, + int32 *minibatch_size, + std::vector *tasks); + + /** + formats the inputs to the computation and transfers them to GPU. + @param [in] minibatch_size The number of parallel sequences + we're doing this computation for. This will be + more than tasks.size() in some cases. 
+ @param [in] tasks The tasks we're doing the computation for. + The input comes from here. + @param [out] input The main feature input to the computation is + put into here. + @param [out] ivector If we're using i-vectors, the i-vectors are + put here. + */ + void FormatInputs(int32 minibatch_size, + const std::vector &tasks, + CuMatrix *input, + CuMatrix *ivector); + + + // Copies 'output', piece by piece, to the 'output_cpu' or 'output' + // members of 'tasks', depending on their 'output_to_cpu' value. + void FormatOutputs(const CuMatrix &output, + const std::vector &tasks); + + + // Changes opts_.frames_per_chunk to be a multiple of + // opts_.frame_subsampling_factor, if needed. + void CheckAndFixConfigs(); + + // this function creates and returns the computation request which is to be + // compiled. + static void GetComputationRequest(const NnetInferenceTask &task, + int32 minibatch_size, + ComputationRequest *request); + + // Prints some logging information about what we computed, with breakdown by + // minibatch type. + void PrintMinibatchStats(); + + NnetBatchComputerOptions opts_; + const Nnet &nnet_; + CachingOptimizingCompiler compiler_; + CuVector log_priors_; + + // Mutex that guards this object. It is only held for fairly quick operations + // (not while the actual computation is being done). + std::mutex mutex_; + + // tasks_ contains all the queued tasks. + // Each key contains a vector of NnetInferenceTask* pointers, of the same + // structure (i.e., IsCompatible() returns true). + MapType tasks_; + + // num_full_minibatches_ is a function of the data in tasks_ (and the + // minibatch sizes, specified in opts_. It is the number of full minibatches + // of tasks that are pending, meaning: for each group of tasks, the number of + // pending tasks divided by the minibatch-size for that group in integer + // arithmetic. 
This is kept updated for thread synchronization reasons, because + // it is the shared variable + int32 num_full_minibatches_; + + // a map from 'n' to a condition variable corresponding to the condition: + // num_full_minibatches_ <= n. Any time the number of full minibatches drops + // below n, the corresponding condition variable is notified (if it exists). + std::unordered_map no_more_than_n_minibatches_full_; + + // some static information about the neural net, computed at the start. + int32 nnet_left_context_; + int32 nnet_right_context_; + int32 input_dim_; + int32 ivector_dim_; + int32 output_dim_; +}; + + +/** + This class implements a simplified interface to class NnetBatchComputer, + which is suitable for programs like 'nnet3-compute' where you want to support + fast GPU-based inference on a sequence of utterances, and get them back + from the object in the same order. + */ +class NnetBatchInference { + public: + + NnetBatchInference( + const NnetBatchComputerOptions &opts, + const Nnet &nnet, + const VectorBase &priors); + + /** + The user should call this one by one for the utterances that this class + needs to compute (interspersed with calls to GetOutput()). This call + will block when enough ready-to-be-computed data is present. + + @param [in] utterance_id The string representing the utterance-id; + it will be provided back to the user when GetOutput() is + called. + @param [in] input The input features (e.g. MFCCs) + @param [in] ivector If non-NULL, this is expected to be the + i-vector for this utterance (and 'online_ivectors' should + be NULL). + @param [in] online_ivector_period Only relevant if + 'online_ivector' is non-NULL, this says how many + frames of 'input' is covered by each row of + 'online_ivectors'. 
+ */ + void AcceptInput(const std::string &utterance_id, + const Matrix &input, + const Vector *ivector, + const Matrix *online_ivectors, + int32 online_ivector_period); + + /** + The user should call this after the last input has been provided + via AcceptInput(). This will force the last utterances to be + flushed out (to be retrieved by GetOutput()), rather than waiting + until the relevant minibatches are full. + */ + void Finished(); + + /** + The user should call this to obtain output. It's guaranteed to + be in the same order as the input was provided, but it may be + delayed. 'output' will be the output of the neural net, spliced + together over the chunks (and with acoustic scaling applied if + it was specified in the options); the subtraction of priors will + depend on whether you supplied a non-empty vector of priors to the + constructor. + + This call does not block (i.e. does not wait on any semaphores) unless you + have previously called Finished(). It returns true if it actually got any + output; if none was ready it will return false. + */ + bool GetOutput(std::string *utterance_id, + Matrix *output); + + ~NnetBatchInference(); + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(NnetBatchInference); + + // This is the computation thread, which is run in the background. It will + // exit once the user calls Finished() and all computation is completed. + void Compute(); + // static wrapper for Compute(). + static void ComputeFunc(NnetBatchInference *object) { object->Compute(); } + + + // This object implements the internals of what this class does. It is + // accessed both by the main thread (from where AcceptInput(), Finished() and + // GetOutput() are called), and from the background thread in which Compute() + // is called.
+ NnetBatchComputer computer_; + + // This is set to true when the user calls Finished(); the computation thread + // sees it and knows to flush + bool is_finished_; + + // This semaphore is signaled by the main thread (the thread in which + // AcceptInput() is called) every time a new utterance is added, and waited on + // in the background thread in which Compute() is called. + Semaphore tasks_ready_semaphore_; + + struct UtteranceInfo { + std::string utterance_id; + // The tasks into which we split this utterance. + std::vector tasks; + // 'num_tasks_finished' is the number of tasks which are known to be + // finished, meaning we successfully waited for those tasks' 'semaphore' + // member. When this reaches tasks.size(), we are ready to consolidate + // the output into a single matrix and return it to the user. + size_t num_tasks_finished; + }; + + // This list is only accessed directly by the main thread, by AcceptInput() + // and GetOutput(). It is a list of utterances, with more recently added ones + // at the back. When utterances are given to the user by GetOutput(), + std::list utts_; + + int32 utterance_counter_; // counter that increases on every utterance. + + // The thread running the Compute() process. + std::thread compute_thread_; +}; + + +/** + Decoder object that uses multiple CPU threads for the graph search, plus a + GPU for the neural net inference (that's done by a separate + NnetBatchComputer object). The interface of this object should + accessed from only one thread, though-- presumably the main thread of the + program. + */ +class NnetBatchDecoder { + public: + /** + Constructor. + @param [in] fst FST that we are decoding with, will be shared between + all decoder threads. + @param [in] decoder_config Configuration object for the decoders. + @param [in] trans_model The transition model-- needed to construct the decoders, + and for determinization. 
+ @param [in] word_syms A pointer to a symbol table of words, used for printing + the decoded words to stderr. If NULL, the word-level output will not + be logged. + @param [in] allow_partial If true, in cases where no final-state was reached + on the final frame of the decoding, we still output a lattice; + it just may contain partial words (words that are cut off in + the middle). If false, we just won't output anything for + those lattices. + @param [in] num_threads The number of decoder threads to use. It will use + two more threads on top of this: the main thread, for I/O, + and a thread for possibly-GPU-based inference. + @param [in] computer The NnetBatchComputer object, through which the + neural net will be evaluated. + */ + NnetBatchDecoder(const fst::Fst &fst, + const LatticeFasterDecoderConfig &decoder_config, + const TransitionModel &trans_model, + const fst::SymbolTable *word_syms, + bool allow_partial, + int32 num_threads, + NnetBatchComputer *computer); + + /** + The user should call this one by one for the utterances that + it needs to compute (interspersed with calls to GetOutput()). This + call will block when no threads are ready to start processing this + utterance. + + @param [in] utterance_id The string representing the utterance-id; + it will be provided back to the user when GetOutput() is + called. + @param [in] input The input features (e.g. MFCCs) + @param [in] ivector If non-NULL, this is expected to be the + i-vector for this utterance (and 'online_ivectors' should + be NULL). + @param [in] online_ivector_period Only relevant if + 'online_ivector' is non-NULL, this says how many + frames of 'input' is covered by each row of + 'online_ivectors'. 
+ */ + void AcceptInput(const std::string &utterance_id, + const Matrix &input, + const Vector *ivector, + const Matrix *online_ivectors, + int32 online_ivector_period); + + /* + The user should call this function each time there was a problem with an utterance + prior to being able to call AcceptInput()-- e.g. missing i-vectors. This will + update the num-failed-utterances stats which are stored in this class. + */ + void UtteranceFailed(); + + /* + The user should call this when all input has been provided, e.g. + when AcceptInput will not be called any more. It will block until + all threads have terminated; after that, you can call GetOutput() + until it returns false, which will guarantee that nothing remains + to compute. + It returns the number of utterances that have been successfully decoded. + */ + int32 Finished(); + + /** + The user should call this to obtain output (This version should + only be called if config.determinize_lattice == true (w.r.t. the + config provided to the constructor)). The output is guaranteed to + be in the same order as the input was provided, but it may be + delayed, *and* some outputs may be missing, for example because + of search failures (allow_partial will affect this). + + The acoustic scores in the output lattice will already be divided by + the acoustic scale we decoded with. + + This call does not block (i.e. does not wait on any semaphores). It + returns true if it actually got any output; if none was ready it will + return false. + @param [out] utterance_id If an output was ready, its utterance-id is written to here. + @param [out] clat If an output was ready, its compact lattice will be + written to here. + @param [out] sentence If an output was ready and a nonempty symbol table + was provided to the constructor of this class, contains + the word-sequence decoded as a string. Otherwise will + be empty. + @return Returns true if a decoded output was ready.
(These appear asynchronously + as the decoding is done in background threads). + */ + bool GetOutput(std::string *utterance_id, + CompactLattice *clat, + std::string *sentence); + + // This version of GetOutput is for where config.determinize_lattice == false + // (w.r.t. the config provided to the constructor). It is the same as the + // other version except it outputs to a normal Lattice, not a CompactLattice. + bool GetOutput(std::string *utterance_id, + Lattice *lat, + std::string *sentence); + + ~NnetBatchDecoder(); + + private: + KALDI_DISALLOW_COPY_AND_ASSIGN(NnetBatchDecoder); + + struct UtteranceInput { + std::string utterance_id; + const Matrix *input; + const Vector *ivector; + const Matrix *online_ivectors; + int32 online_ivector_period; + }; + + // This object is created when a thread finished an utterance. For utterances + // where decoding failed somehow, the relevant lattice (compact_lat, if + // opts_.determinize == true, or lat otherwise) will be empty (have no + // states). + struct UtteranceOutput { + std::string utterance_id; + bool finished; + CompactLattice compact_lat; + Lattice lat; + std::string sentence; // 'sentence' is only nonempty if a non-NULL symbol + // table was provided to the constructor of class + // NnetBatchDecoder; it's the sentence as a string (a + // sequence of words separated by space). It's used + // for printing the sentence to stderr, which we do + // in the main thread to keep the order consistent. + }; + + // This is the decoding thread, several copies of which are run in the + // background. It will exit once the user calls Finished() and all + // computation is completed. + void Decode(); + // static wrapper for Compute(). + static void DecodeFunc(NnetBatchDecoder *object) { object->Decode(); } + + // This is the computation thread; it handles the neural net inference. + void Compute(); + // static wrapper for Compute(). 
+ static void ComputeFunc(NnetBatchDecoder *object) { object->Compute(); } + + + // Sets the priorities of the tasks in a newly provided utterance. + void SetPriorities(std::vector *tasks); + + // In the single-thread case, this sets priority_offset_ to 'priority'. + // In the multi-threaded case it causes priority_offset_ to approach + // 'priority' at a rate that depends on the number of threads. + void UpdatePriorityOffset(double priority); + + // This function does the determinization (if needed) and finds the best path through + // the lattice to update the stats. It is expected that when it is called, 'output' must + // have its 'lat' member set up. + void ProcessOutputUtterance(UtteranceOutput *output); + + const fst::Fst &fst_; + const LatticeFasterDecoderConfig &decoder_opts_; + const TransitionModel &trans_model_; + const fst::SymbolTable *word_syms_; // May be NULL. Owned here. + bool allow_partial_; + NnetBatchComputer *computer_; + std::vector decode_threads_; + std::thread compute_thread_; // Thread that calls computer_->Compute(). + + + // 'input_utterance', together with input_ready_semaphore_ and + // input_consumed_semaphore_, are used to 'hand off' information about a + // newly provided utterance from AcceptInput() to a decoder thread that is + // ready to process a new utterance. + UtteranceInput input_utterance_; + Semaphore input_ready_semaphore_; // Is signaled by the main thread when + // AcceptInput() is called and a new + // utterance is being provided (or when the + // input is finished), and waited on in + // decoder thread. + Semaphore input_consumed_semaphore_; // Is signaled in decoder thread when it + // has finished consuming the input, so + // the main thread can know when it + // should continue (to avoid letting + // 'input' go out of scope while it's + // still needed). + + Semaphore tasks_ready_semaphore_; // Is signaled when new tasks are added to + // the computer_ object (or when we're finished).
+ + bool is_finished_; // True if the input is finished. If this is true, a + // signal to input_ready_semaphore_ indicates to the + // decoder thread that it should terminate. + + bool tasks_finished_; // True if we know that no more tasks will be given + // to the computer_ object. + + + // pending_utts_ is a list of utterances that have been provided via + // AcceptInput(), but their decoding has not yet finished. AcceptInput() will + // push_back to it, and GetOutput() will pop_front(). When a decoding thread + // has finished an utterance it will set its 'finished' member to true. There + // is no need to synchronize or use mutexes here. + std::list pending_utts_; + + // priority_offset_ is something used in determining the priorities of nnet + // computation tasks. It starts off at zero and becomes more negative with + // time, with the aim being that the priority of the first task (i.e. the + // leftmost chunk) of a new utterance should be at about the same priority as + // whatever chunks we are just now getting around to decoding. + double priority_offset_; + + // Some statistics accumulated by this class, for logging and timing purposes. + double tot_like_; // Total likelihood (of best path) over all lattices that + // we output. + int64 frame_count_; // Frame count over all lattices that we output. + int32 num_success_; // Number of successfully decoded files. + int32 num_fail_; // Number of files where decoding failed. + int32 num_partial_; // Number of files that were successfully decoded but + // reached no final-state (can only be nonzero if + // allow_partial_ is true). + std::mutex stats_mutex_; // Mutex that guards the statistics from tot_like_ + // through num_partial_. + Timer timer_; // Timer used to print real-time info.
+}; + + +} // namespace nnet3 +} // namespace kaldi + +#endif // KALDI_NNET3_NNET_BATCH_COMPUTE_H_ diff --git a/src/nnet3/nnet-compile-utils.cc b/src/nnet3/nnet-compile-utils.cc index 49012e08884..b1f9d0b0e2b 100644 --- a/src/nnet3/nnet-compile-utils.cc +++ b/src/nnet3/nnet-compile-utils.cc @@ -25,351 +25,164 @@ namespace kaldi { namespace nnet3 { -// this comparator will be used to sort pairs using first_element -// we declare it as a struct as it will also be used by std::lower_bound -// method which will supply elements of different types to the function -struct FirstElementComparator { - int first_element(int32 t) const { - return t; - } - - int first_element(std::pair t) const { - return t.first; - } - template< typename T1, typename T2> - bool operator()( T1 const & t1, T2 const & t2) const { - return first_element(t1) < first_element(t2); - } -}; - -// This comparator is used with std::find_if function to search for pairs -// whose first element is equal to the given pair -struct FirstElementIsEqualComparator : - public std::unary_function, bool> -{ - explicit FirstElementIsEqualComparator(const int32 element): - element_(element) {} - bool operator() (std::pair const &arg) - { return (arg.first == element_); } - int32 element_; -}; - -// This comparator is used with std::find_if function to search for pairs -// whose .first and .second elements are equal to the given pair -struct PairIsEqualComparator : - public std::unary_function, bool> -{ - explicit PairIsEqualComparator(const std::pair pair): - pair_(pair) {} - bool operator() (std::pair const &arg) - { - if (pair_.first == arg.first) - return pair_.second == arg.second; - return false; +/** + Gets counts of submatrices (the 1st members of pairs) in submat_lists. + Also outputs, to 'submats_with_large_counts', a list of submatrix indexes + that have counts over half of submat_lists.size(). (These will be separated + out into their own AddRows() commands). 
+ */ +void GetSubmatCounts( + const std::vector > > &submat_lists, + std::unordered_map *submat_counts, + std::vector *submats_with_large_counts) { + auto iter = submat_lists.begin(), end = submat_lists.end(); + for (; iter != end; ++iter) { + std::vector >::const_iterator + iter2 = iter->begin(), end2 = iter->end(); + for (; iter2 != end2; ++iter2) { + int32 submat_index = iter2->first; + KALDI_ASSERT(submat_index >= 0); // We don't expect -1's in submat_lists. + std::unordered_map::iterator + iter = submat_counts->find(submat_index); + if (iter == submat_counts->end()) + (*submat_counts)[submat_index] = 1; + else + iter->second++; + } } - std::pair pair_; -}; - -// this comparator will be used to sort pairs initially by second element in -// descending order and then by first element in descending order. -// note, std::sort accepts an actual function as an alternative to a -// function object. -bool SecondElementComparator(const std::pair& first_pair, - const std::pair& second_pair) { - if (first_pair.second == second_pair.second) - return first_pair.first > second_pair.first; - return first_pair.second > second_pair.second; + auto counts_iter = submat_counts->begin(), + counts_end = submat_counts->end(); + size_t cutoff = submat_lists.size() / 2; + for (; counts_iter != counts_end; ++counts_iter) + if (counts_iter->second > cutoff) + submats_with_large_counts->push_back(counts_iter->first); } -// Function to sort the lists in a vector of lists of pairs, by the first -// element of the pair -void SortSubmatLists( - // vector of list of location pairs - const std::vector > > submat_lists, - // a copy of the input submat_lists where the lists are sorted - // (this will be used in the caller function for sort and find functions) - std::vector > > * sorted_submat_lists, - // maximum size of the submat_lists - int32* max_submat_list_size - ) -{ - *max_submat_list_size = 0; - sorted_submat_lists->reserve(submat_lists.size()); - KALDI_ASSERT(submat_lists.size() > 0); - 
for (int32 i = 0; i < submat_lists.size(); i++) { - if (submat_lists[i].size() > *max_submat_list_size) - *max_submat_list_size = submat_lists[i].size(); - sorted_submat_lists->push_back(submat_lists[i]); - std::sort((*sorted_submat_lists)[i].begin(), - (*sorted_submat_lists)[i].end(), - FirstElementComparator()); +/** + This function, used in SplitLocations(), is used to make separate + 'split lists' for certain high-count submatrix indexes, specified by + the user in 'submats_to_separate'. These split + lists will be lists of pairs that are all either (-1, 1) or (submatrix_index, x) + for a particular submatrix index (constant within the split list). + These high-count lists will be written to 'split_lists'; they + will eventually compile to AddRows() commands. We write the remaining + members of the lists in 'submat_lists' (the ones that did not make it + into 'split_lists') to 'reduced_submat_lists'. + */ +void SeparateSubmatsWithLargeCounts( + const std::vector &submats_to_separate, + const std::vector > > &submat_lists, + std::vector > > *reduced_submat_lists, + std::vector > > *split_lists) { + KALDI_ASSERT(split_lists->empty() && !submats_to_separate.empty()); + size_t num_to_separate = submats_to_separate.size(), + num_rows = submat_lists.size(); + std::unordered_map submat_to_index; + reduced_submat_lists->clear(); + reduced_submat_lists->resize(num_rows); + split_lists->resize(num_to_separate); + for (size_t i = 0; i < num_to_separate; i++) { + (*split_lists)[i].resize(num_rows, std::pair(-1, -1)); + int32 submat = submats_to_separate[i]; + submat_to_index[submat] = i; } -} - -// Function to compute a histogram of the submat_index, -// which is the first_element in the location pair, given vector of list of -// location pairs -void ComputeSubmatIndexHistogram( - // vector of list of pairs of location pairs where the lists are sorted - // by submat_indexes (.first element) - const std::vector > > - sorted_submat_lists, - // a histogram of submat_indexes 
where - // the keys are submat_indexes and values are a vector of frequencies - // of first occurrence, second occurrence, etc. of a submat_index - // in a submat_list - unordered_map >* submat_histogram - ) { - KALDI_ASSERT(sorted_submat_lists.size() > 0); - // computing the submat_histogram - // counting the occurrences of each element in the current submat_list; - // each new occurrence of the same element, in this list, is counted - // as a seperate symbol for frequency counts - for (int32 i = 0; i < sorted_submat_lists.size(); i++) { - int j = 0; - unordered_map >::iterator histogram_iterator - = submat_histogram->end(); - int32 repetition_count = 0; - while (j < sorted_submat_lists[i].size()) { - if ((histogram_iterator == submat_histogram->end()) || - (histogram_iterator->first != sorted_submat_lists[i][j].first)) { - histogram_iterator = - submat_histogram->find(sorted_submat_lists[i][j].first); - repetition_count = 0; - // if a histogram entry was not found for this submat_index, add one - if (histogram_iterator == submat_histogram->end()) { - (*submat_histogram)[sorted_submat_lists[i][j].first]; - histogram_iterator = submat_histogram->find( - sorted_submat_lists[i][j].first); - } + for (size_t row = 0; row < submat_lists.size(); row++) { + std::vector >::const_iterator + iter = submat_lists[row].begin(), end = submat_lists[row].end(); + std::vector > + &reduced_list = (*reduced_submat_lists)[row]; + // 'reduced_lists' will contain the pairs that don't make it into + // 'split_lists'. + for (; iter != end; ++iter) { + int32 submat_index = iter->first; + std::unordered_map::const_iterator map_iter = + submat_to_index.find(submat_index); + if (map_iter == submat_to_index.end()) { // not a large-count submatrix. 
+ reduced_list.push_back(*iter); + continue; } - - if (repetition_count >= (histogram_iterator->second).size()) { - // this is the first time the submat_index repeated this many times - // so add an entry for this in the count vector - (histogram_iterator->second).push_back(1); - } else { - (histogram_iterator->second)[repetition_count]++; + size_t index = map_iter->second; + std::pair &p = (*split_lists)[index][row]; + if (p.first >= 0) { + // we'd only reach here if the same submat index repeated in the same + // row, which is possible but rare. + reduced_list.push_back(*iter); + continue; } - repetition_count++; - j++; - } - } -} - - -// Function to find the first occurrence of a submat_index in list of location -// pairs from a vector of list of locations pairs. -// The occurrences are returned as a list of vector iterators, -// pointing to the position of the pair in the list or to the -// end of the list (when the pair is not present) -void FindSubmatIndexInSubmatLists( - // submat_index to search in the submat_lists - int32 submat_index, - // sorted_submat_lists is a pointer as we want non-const iterators in the - // output - std::vector > > *sorted_submat_lists, - // a vector of iterators to store the location of the pairs - std::vector >::iterator> - *output_iterator_list, - // the max size of the submat_lists if the found pairs have been removed - int32 *max_remaining_submat_list_size) { - - output_iterator_list->reserve(sorted_submat_lists->size()); - *max_remaining_submat_list_size = 0; - for (int32 i = 0; i < sorted_submat_lists->size(); i++) { - std::vector< std::pair > & submat_list = - (*sorted_submat_lists)[i]; - output_iterator_list->push_back( - std::find_if(submat_list.begin(), submat_list.end(), - FirstElementIsEqualComparator(submat_index))); - int32 remaining_submat_list_size = submat_list.size(); - if (output_iterator_list->back() != submat_list.end()) { - // since the submat_index is present in this submat_list - // if submat_index was 
deleted from the list - // the remaining submat_list's size is reduced by 1 - remaining_submat_list_size--; - } - *max_remaining_submat_list_size = remaining_submat_list_size - > *max_remaining_submat_list_size ? remaining_submat_list_size : - *max_remaining_submat_list_size; - } -} - -// Function to extract the identified pairs (identified with an iterator) -// from a vector of list of pairs, "to extract" means to copy into -// a list and erase the original pair from the submat_lists -void ExtractGivenPairsFromSubmatLists( - std::vector >::iterator> - &input_iterator_list, - std::vector > > *sorted_submat_lists, - std::vector > *list_of_pairs) { - list_of_pairs->reserve(sorted_submat_lists->size()); - for (int32 i = 0; i < input_iterator_list.size(); i++) { - if (input_iterator_list[i] != (*sorted_submat_lists)[i].end()) { - // there was an element with the submat_index in the current list - list_of_pairs->push_back(*input_iterator_list[i]); - (*sorted_submat_lists)[i].erase(input_iterator_list[i]); - } else { - // insert a dummy element. 
Callers of this function expect the dummy - // element to be (-1, -1) - list_of_pairs->push_back(std::make_pair(-1, -1)); - } - } -} - -// Function to extract the last pairs from a vector of list of pairs -// a dummy is added when the list is empty -static void ExtractLastPairFromSubmatLists( - std::vector > > *sorted_submat_lists, - std::vector > *list_of_pairs) { - list_of_pairs->reserve(sorted_submat_lists->size()); - for (int32 i = 0; i < sorted_submat_lists->size(); i++) { - if ((*sorted_submat_lists)[i].size() == 0) { - // the value of the dummy has to be (-1, -1) as down stream code has - // expects -1 values for dummies - list_of_pairs->push_back(std::make_pair(-1, -1)); - continue; + p.first = submat_index; + int32 src_row_index = iter->second; + p.second = src_row_index; } - list_of_pairs->push_back((*sorted_submat_lists)[i].back()); - (*sorted_submat_lists)[i].pop_back(); } } -// Function which does the actual splitting of submat_lists. But it operates on -// sorted submat_lists and uses submat_histogram_vector. 
-// See SplitLocations, below for the algorithm -static void SplitLocationsUsingSubmatHistogram( - // maximum size of the lists in the sorted_submat_lists - int32 max_submat_list_size, - // a vector of list of pairs where each list is expected to be sorted - // this is a pointer as the lists will be modified - std::vector > > *sorted_submat_lists, - // a vector of pairs to represent a histogram - // this is a pointer as the vector will be sorted - std::vector > *submat_histogram_vector, - // a vector of lists of pairs with rearranged pairs - std::vector > > *split_lists) { - - // sort the submat_histogram_vector based on second element of pair - // in descending order then first element of pair in descending order - std::sort(submat_histogram_vector->begin(), - submat_histogram_vector->end(), SecondElementComparator); - - int32 prev_max_remaining_submat_list_size = max_submat_list_size; - std::vector >::iterator iter; - for (iter = submat_histogram_vector->begin(); - iter != submat_histogram_vector->end(); - ++iter) { - std::pair submat_index_and_count = *iter; - std::vector >::iterator> - output_iterator_list; - int32 max_remaining_submat_list_size = 0; - FindSubmatIndexInSubmatLists(submat_index_and_count.first, - sorted_submat_lists, - &output_iterator_list, - &max_remaining_submat_list_size); - if (max_remaining_submat_list_size - < prev_max_remaining_submat_list_size) { - // since we will have a smaller max_remaining_submat_list_size by - // splitting this submat_index into a seperate list, - // we will split it; - std::vector > list_of_pairs; - ExtractGivenPairsFromSubmatLists(output_iterator_list, - sorted_submat_lists, - &list_of_pairs); - split_lists->push_back(list_of_pairs); - prev_max_remaining_submat_list_size = max_remaining_submat_list_size; - } - } - - // rearrange the remaining pairs into lists where - // pairs with multiple first elements are allowed - // Note : we don't yet know if there is any advantage of having multiple - // calls to the same 
submat in kAddRowsMulti. If this is actually helpful - // then use the sorted_histogram_vector to first copy submat_indexes which - // did not make it to kAddRows calls - for (int32 i = 0; i < prev_max_remaining_submat_list_size; i++) { - std::vector > list_of_pairs; - ExtractLastPairFromSubmatLists(sorted_submat_lists, - &list_of_pairs); - split_lists->push_back(list_of_pairs); - } -} - -// Function rearranges the submat_lists (see nnet-compute-utils.h for -// description of submat_lists), into lists that can be used as inputs -// for kAddRows and kAddRowsMulti calls. -// kAddRows requires a list of pairs where all the first elements correspond to -// the same submat_index. -// kAddRowsMulti uses a list of pairs where the first elements can correspond to -// multiple submat_index locations. -// ------------------------ -// The maximum size of a list in submat_lists is the minimum number of -// kAddRowsMulti calls necessary. -// In the current implementation we replace kAddRowsMulti calls with -// kAddRows calls wherever possible, while not increasing the number of calls. -// -// Algorithm : -// The function computes a histogram of submat_indexes and spans through the -// submat_indexes in descending order of frequency. For each submat_index a -// decision is made to copy it using a kAddRows call or not. -// A kAddRow call is made for a submat_index if splitting it into a seperate -// list reduces the max_submat_list_size by one, i.e., reduces the number of -// remaining kAddRowsMulti calls. -// submat_indexes which cannot be assigned to kAddRow calls are rearranged into -// lists for kAddRowsMulti calls. 
-// -// Note : To decide splits we could have solved a combinatorial -// optimization problem where we find the best set of -// kAddRows + kAddRowsMulti calls; -// but given that both these calls have similar costs, -// and that the average number of elements in a submat_list is around 4, -// it does not make sense to -// choose a kAddRows call unless it is able to immediately reduce a -// kAddRowsMulti call. So we simplify the process and stay away from any -// complex search algorithms. We might implement a solution where a more -// elaborate search is done,if the length of the lists increases -// for newer NN architectures, as even minor savings in speed due to increased -// number of kAddRows calls can accumulate compensating for the additional calls - void SplitLocations( const std::vector > > &submat_lists, std::vector > > *split_lists) { + size_t num_rows = submat_lists.size(), + num_output_lists = 0; + auto iter = submat_lists.begin(), end = submat_lists.end(); + for (; iter != end; ++iter) + if (iter->size() > num_output_lists) + num_output_lists = iter->size(); + split_lists->clear(); + if (num_output_lists == 0) // Odd, but could happen, maybe + return; + else if (num_output_lists == 1) { + split_lists->resize(1); + std::vector > &list = (*split_lists)[0]; + list.resize(num_rows, std::pair(-1, -1)); + for (size_t i = 0; i < num_rows; i++) { + if (!submat_lists[i].empty()) + list[i] = submat_lists[i][0]; + } + return; + } - // a histogram of the submat_indexes in the submat_lists - // each occurence in a given submat_list is considered unique so we maintain - // a vector to count each occurrence separately. 
- // The i'th element in the vector corresponds to the count of - // the (i+1)'th occurrence of a submat_index in a submat_list - unordered_map > submat_histogram; - - int32 max_submat_list_size = 0; - - // initializing a vector of list of pairs to store the sorted submat_lists - std::vector > > - sorted_submat_lists; - SortSubmatLists(submat_lists, &sorted_submat_lists, &max_submat_list_size); - ComputeSubmatIndexHistogram(sorted_submat_lists, &submat_histogram); - // the vector has same information as the submat_histogram, but it is - // suitable for sorting according to frequency. The first elements of pairs - // can be repeated, these correspond to different occurrences in the same list - std::vector > submat_histogram_vector; - // copy the key, occurence_counts from submat_histogram to a vector - unordered_map >::iterator hist_iter; - for (hist_iter = submat_histogram.begin(); - hist_iter != submat_histogram.end(); - ++hist_iter) { - for (int32 i = 0; i < (hist_iter->second).size(); i++) { - submat_histogram_vector.push_back( - std::make_pair(hist_iter->first, (hist_iter->second)[i])); + // counts for each submatrix index, of how many times it occurs. + std::unordered_map submat_counts; + std::vector submats_with_large_counts; + GetSubmatCounts(submat_lists, &submat_counts, &submats_with_large_counts); + if (!submats_with_large_counts.empty()) { + // There are submatrices with counts over half the num-rows. We assign these + // their own output lists. + + std::vector > > reduced_submat_lists; + SeparateSubmatsWithLargeCounts(submats_with_large_counts, + submat_lists, + &reduced_submat_lists, + split_lists); + // 'reduced_split_lists' is the result of recursing with input 'reduced_submat_lists'; + // we'll append its result to 'split_lists'. 
+ std::vector > > reduced_split_lists; + SplitLocations(reduced_submat_lists, &reduced_split_lists); + size_t cur_num_lists = split_lists->size(), + num_extra_lists = reduced_split_lists.size(), + new_num_lists = cur_num_lists + num_extra_lists; + split_lists->resize(new_num_lists); + for (size_t i = 0; i < num_extra_lists; i++) + (*split_lists)[cur_num_lists + i].swap(reduced_split_lists[i]); + return; + // and we're done. + } else { + // All the counts of submatrix indexes seem to be small so we are resigned to + // only using AddRowsMulti commands. + split_lists->resize(num_output_lists); + for (size_t i = 0; i < num_output_lists; i++) + (*split_lists)[i].resize(num_rows, std::pair(-1, -1)); + for (size_t row = 0; row < num_rows; row++) { + const std::vector > &this_list = + submat_lists[row]; + size_t this_list_size = submat_lists[row].size(); + for (size_t i = 0; i < this_list_size; i++) { + (*split_lists)[i][row] = this_list[i]; + } } } - SplitLocationsUsingSubmatHistogram(max_submat_list_size, &sorted_submat_lists, - &submat_histogram_vector, split_lists); } + /* If it is the case for some i >= 0 that all the .first elements of "location_vector" are either i or -1, then output i to first_value and the .second elements into "second_values", and return true. Otherwise return diff --git a/src/nnet3/nnet-compile-utils.h b/src/nnet3/nnet-compile-utils.h index 124f40f3421..e21f81aecdd 100644 --- a/src/nnet3/nnet-compile-utils.h +++ b/src/nnet3/nnet-compile-utils.h @@ -32,11 +32,15 @@ namespace nnet3 { /** - The input to this function is a vector of lists of pairs, and this function - splits it up into a list of vectors of pairs. In order to make the lists all - the same length it may have to insert "dummy" pairs with value (-1, -1). - In addition, this function implement certain heuristics to break up the - list into pairs in a particular desirable way, which we will describe below. 
+ The input to this function is a vector (indexed by matrix-row-index) of lists + of pairs (submat_index, row_index), and this function splits it up into a + list of vectors of pairs, where those vectors are indexed by + matrix-row-index. + + In order to make the lists all the same length it may have to insert "dummy" + pairs with value (-1, -1). In addition, this function implement certain + heuristics to break up the list into pairs in a particular desirable way, + which we will describe below. Let the input be `submat_lists`, and let `num_rows = submat_lists.size()`. The value -1 is not expected to appear as either the .first or .second @@ -74,7 +78,6 @@ namespace nnet3 { See documentation here: \ref dnn3_compile_compiler_split_locations */ - void SplitLocations( const std::vector > > &submat_lists, std::vector > > *split_lists); @@ -179,4 +182,3 @@ void GetNxList(const std::vector &indexes, #endif - diff --git a/src/nnet3bin/Makefile b/src/nnet3bin/Makefile index d187a7b61aa..67d15d3c38a 100644 --- a/src/nnet3bin/Makefile +++ b/src/nnet3bin/Makefile @@ -19,7 +19,7 @@ BINFILES = nnet3-init nnet3-info nnet3-get-egs nnet3-copy-egs nnet3-subset-egs \ nnet3-discriminative-subset-egs nnet3-get-egs-simple \ nnet3-discriminative-compute-from-egs nnet3-latgen-faster-looped \ nnet3-egs-augment-image nnet3-xvector-get-egs nnet3-xvector-compute \ - nnet3-latgen-grammar + nnet3-latgen-grammar nnet3-compute-batch nnet3-latgen-faster-batch OBJFILES = @@ -32,7 +32,7 @@ ADDLIBS = ../nnet3/kaldi-nnet3.a ../chain/kaldi-chain.a \ ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/nnet3bin/nnet3-compute-batch.cc 
b/src/nnet3bin/nnet3-compute-batch.cc new file mode 100644 index 00000000000..b0001c96f57 --- /dev/null +++ b/src/nnet3bin/nnet3-compute-batch.cc @@ -0,0 +1,204 @@ +// nnet3bin/nnet3-compute-batch.cc + +// Copyright 2012-2018 Johns Hopkins University (author: Daniel Povey) +// 2018 Hang Lyu + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. +// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/kaldi-common.h" +#include "util/common-utils.h" +#include "nnet3/nnet-batch-compute.h" +#include "base/timer.h" +#include "nnet3/nnet-utils.h" + + +int main(int argc, char *argv[]) { + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + typedef kaldi::int64 int64; + + const char *usage = + "Propagate the features through raw neural network model " + "and write the output. This version is optimized for GPU use. 
" + "If --apply-exp=true, apply the Exp() function to the output " + "before writing it out.\n" + "\n" + "Usage: nnet3-compute-batch [options] " + "\n" + " e.g.: nnet3-compute-batch final.raw scp:feats.scp " + "ark:nnet_prediction.ark\n"; + + ParseOptions po(usage); + Timer timer; + + NnetBatchComputerOptions opts; + opts.acoustic_scale = 1.0; // by default do no scaling + + bool apply_exp = false, use_priors = false; + std::string use_gpu = "yes"; + + std::string word_syms_filename; + std::string ivector_rspecifier, + online_ivector_rspecifier, + utt2spk_rspecifier; + int32 online_ivector_period = 0; + opts.Register(&po); + + po.Register("ivectors", &ivector_rspecifier, "Rspecifier for " + "iVectors as vectors (i.e. not estimated online); per " + "utterance by default, or per speaker if you provide the " + "--utt2spk option."); + po.Register("utt2spk", &utt2spk_rspecifier, "Rspecifier for " + "utt2spk option used to get ivectors per speaker"); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier for " + "iVectors estimated online, as matrices. 
If you supply this," + " you must set the --online-ivector-period option."); + po.Register("online-ivector-period", &online_ivector_period, "Number of " + "frames between iVectors in matrices supplied to the " + "--online-ivectors option"); + po.Register("apply-exp", &apply_exp, "If true, apply exp function to " + "output"); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + po.Register("use-priors", &use_priors, "If true, subtract the logs of the " + "priors stored with the model (in this case, " + "a .mdl file is expected as input)."); + + po.Read(argc, argv); + + if (po.NumArgs() != 3) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().AllowMultithreading(); + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + std::string nnet_rxfilename = po.GetArg(1), + feature_rspecifier = po.GetArg(2), + matrix_wspecifier = po.GetArg(3); + + Nnet raw_nnet; + AmNnetSimple am_nnet; + if (use_priors) { + bool binary; + TransitionModel trans_model; + Input ki(nnet_rxfilename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + } else { + ReadKaldiObject(nnet_rxfilename, &raw_nnet); + } + Nnet &nnet = (use_priors ? 
am_nnet.GetNnet() : raw_nnet); + SetBatchnormTestMode(true, &nnet); + SetDropoutTestMode(true, &nnet); + CollapseModel(CollapseModelConfig(), &nnet); + + Vector priors; + if (use_priors) + priors = am_nnet.Priors(); + + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReaderMapped ivector_reader( + ivector_rspecifier, utt2spk_rspecifier); + + BaseFloatMatrixWriter matrix_writer(matrix_wspecifier); + + int32 num_success = 0, num_fail = 0; + std::string output_uttid; + Matrix output_matrix; + + + NnetBatchInference inference(opts, nnet, priors); + + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features = feature_reader.Value(); + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + num_fail++; + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; + num_fail++; + continue; + } else { + ivector = new Vector(ivector_reader.Value(utt)); + } + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; + num_fail++; + continue; + } else { + online_ivectors = new Matrix( + online_ivector_reader.Value(utt)); + } + } + + inference.AcceptInput(utt, features, ivector, online_ivectors, + online_ivector_period); + + std::string output_key; + Matrix output; + while (inference.GetOutput(&output_key, &output)) { + if (apply_exp) + output.ApplyExp(); + matrix_writer.Write(output_key, output); + num_success++; + } + } + + inference.Finished(); + std::string output_key; + Matrix output; + while (inference.GetOutput(&output_key, &output)) { + if (apply_exp) + output.ApplyExp(); + 
matrix_writer.Write(output_key, output); + num_success++; + } +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + double elapsed = timer.Elapsed(); + KALDI_LOG << "Time taken "<< elapsed << "s"; + KALDI_LOG << "Done " << num_success << " utterances, failed for " + << num_fail; + + if (num_success != 0) { + return 0; + } else { + return 1; + } + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/nnet3bin/nnet3-compute.cc b/src/nnet3bin/nnet3-compute.cc index f67167bc819..45fde99a4f5 100644 --- a/src/nnet3bin/nnet3-compute.cc +++ b/src/nnet3bin/nnet3-compute.cc @@ -48,7 +48,7 @@ int main(int argc, char *argv[]) { Timer timer; NnetSimpleComputationOptions opts; - opts.acoustic_scale = 1.0; // by default do no scaling in this recipe. + opts.acoustic_scale = 1.0; // by default do no scaling. bool apply_exp = false, use_priors = false; std::string use_gpu = "yes"; diff --git a/src/nnet3bin/nnet3-latgen-faster-batch.cc b/src/nnet3bin/nnet3-latgen-faster-batch.cc new file mode 100644 index 00000000000..fad2d5ed356 --- /dev/null +++ b/src/nnet3bin/nnet3-latgen-faster-batch.cc @@ -0,0 +1,227 @@ +// nnet3bin/nnet3-latgen-faster-batch.cc + +// Copyright 2012-2016 Johns Hopkins University (author: Daniel Povey) + 2014 Guoguo Chen + +// See ../../COPYING for clarification regarding multiple authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED +// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE, +// MERCHANTABLITY OR NON-INFRINGEMENT. 
+// See the Apache 2 License for the specific language governing permissions and +// limitations under the License. + + +#include "base/timer.h" +#include "base/kaldi-common.h" +#include "decoder/decoder-wrappers.h" +#include "fstext/fstext-lib.h" +#include "hmm/transition-model.h" +#include "nnet3/nnet-batch-compute.h" +#include "nnet3/nnet-utils.h" +#include "util/kaldi-thread.h" +#include "tree/context-dep.h" +#include "util/common-utils.h" + +namespace kaldi { + +void HandleOutput(bool determinize, + const fst::SymbolTable *word_syms, + nnet3::NnetBatchDecoder *decoder, + CompactLatticeWriter *clat_writer, + LatticeWriter *lat_writer) { + // Write out any lattices that are ready. + std::string output_utterance_id, sentence; + if (determinize) { + CompactLattice clat; + while (decoder->GetOutput(&output_utterance_id, &clat, &sentence)) { + if (word_syms != NULL) + std::cerr << output_utterance_id << ' ' << sentence << '\n'; + clat_writer->Write(output_utterance_id, clat); + } + } else { + Lattice lat; + while (decoder->GetOutput(&output_utterance_id, &lat, &sentence)) { + if (word_syms != NULL) + std::cerr << output_utterance_id << ' ' << sentence << '\n'; + lat_writer->Write(output_utterance_id, lat); + } + } +} + +} // namespace kaldi + +int main(int argc, char *argv[]) { + // note: making this program work with GPUs is as simple as initializing the + // device, but it probably won't make a huge difference in speed for typical + // setups. + try { + using namespace kaldi; + using namespace kaldi::nnet3; + typedef kaldi::int32 int32; + using fst::SymbolTable; + using fst::Fst; + using fst::StdArc; + + const char *usage = + "Generate lattices using nnet3 neural net model. 
This version is optimized\n" + "for GPU-based inference.\n" + "Usage: nnet3-latgen-faster-batch [options] " + " \n"; + ParseOptions po(usage); + + bool allow_partial = false; + LatticeFasterDecoderConfig decoder_opts; + NnetBatchComputerOptions compute_opts; + std::string use_gpu = "yes"; + + std::string word_syms_filename; + std::string ivector_rspecifier, + online_ivector_rspecifier, + utt2spk_rspecifier; + int32 online_ivector_period = 0, num_threads = 1; + decoder_opts.Register(&po); + compute_opts.Register(&po); + po.Register("word-symbol-table", &word_syms_filename, + "Symbol table for words [for debug output]"); + po.Register("allow-partial", &allow_partial, + "If true, produce output even if end state was not reached."); + po.Register("ivectors", &ivector_rspecifier, "Rspecifier for " + "iVectors as vectors (i.e. not estimated online); per utterance " + "by default, or per speaker if you provide the --utt2spk option."); + po.Register("online-ivectors", &online_ivector_rspecifier, "Rspecifier for " + "iVectors estimated online, as matrices. If you supply this," + " you must set the --online-ivector-period option."); + po.Register("online-ivector-period", &online_ivector_period, "Number of frames " + "between iVectors in matrices supplied to the --online-ivectors " + "option"); + po.Register("num-threads", &num_threads, "Number of decoder (i.e. " + "graph-search) threads. 
The number of model-evaluation threads " + "is always 1; this is optimized for use with the GPU."); + po.Register("use-gpu", &use_gpu, + "yes|no|optional|wait, only has effect if compiled with CUDA"); + + po.Read(argc, argv); + + if (po.NumArgs() != 4) { + po.PrintUsage(); + exit(1); + } + +#if HAVE_CUDA==1 + CuDevice::Instantiate().AllowMultithreading(); + CuDevice::Instantiate().SelectGpuId(use_gpu); +#endif + + std::string model_in_rxfilename = po.GetArg(1), + fst_in_rxfilename = po.GetArg(2), + feature_rspecifier = po.GetArg(3), + lattice_wspecifier = po.GetArg(4); + + TransitionModel trans_model; + AmNnetSimple am_nnet; + { + bool binary; + Input ki(model_in_rxfilename, &binary); + trans_model.Read(ki.Stream(), binary); + am_nnet.Read(ki.Stream(), binary); + SetBatchnormTestMode(true, &(am_nnet.GetNnet())); + SetDropoutTestMode(true, &(am_nnet.GetNnet())); + CollapseModel(CollapseModelConfig(), &(am_nnet.GetNnet())); + } + + bool determinize = decoder_opts.determinize_lattice; + CompactLatticeWriter compact_lattice_writer; + LatticeWriter lattice_writer; + if (! (determinize ? 
compact_lattice_writer.Open(lattice_wspecifier) + : lattice_writer.Open(lattice_wspecifier))) + KALDI_ERR << "Could not open table for writing lattices: " + << lattice_wspecifier; + + RandomAccessBaseFloatMatrixReader online_ivector_reader( + online_ivector_rspecifier); + RandomAccessBaseFloatVectorReaderMapped ivector_reader( + ivector_rspecifier, utt2spk_rspecifier); + + fst::SymbolTable *word_syms = NULL; + if (word_syms_filename != "") + if (!(word_syms = fst::SymbolTable::ReadText(word_syms_filename))) + KALDI_ERR << "Could not read symbol table from file " + << word_syms_filename; + + + SequentialBaseFloatMatrixReader feature_reader(feature_rspecifier); + + Fst *decode_fst = fst::ReadFstKaldiGeneric(fst_in_rxfilename); + + int32 num_success; + { + NnetBatchComputer computer(compute_opts, am_nnet.GetNnet(), + am_nnet.Priors()); + NnetBatchDecoder decoder(*decode_fst, decoder_opts, + trans_model, word_syms, allow_partial, + num_threads, &computer); + + for (; !feature_reader.Done(); feature_reader.Next()) { + std::string utt = feature_reader.Key(); + const Matrix &features (feature_reader.Value()); + + if (features.NumRows() == 0) { + KALDI_WARN << "Zero-length utterance: " << utt; + decoder.UtteranceFailed(); + continue; + } + const Matrix *online_ivectors = NULL; + const Vector *ivector = NULL; + if (!ivector_rspecifier.empty()) { + if (!ivector_reader.HasKey(utt)) { + KALDI_WARN << "No iVector available for utterance " << utt; + decoder.UtteranceFailed(); + continue; + } else { + ivector = &ivector_reader.Value(utt); + } + } + if (!online_ivector_rspecifier.empty()) { + if (!online_ivector_reader.HasKey(utt)) { + KALDI_WARN << "No online iVector available for utterance " << utt; + decoder.UtteranceFailed(); + continue; + } else { + online_ivectors = &online_ivector_reader.Value(utt); + } + } + + decoder.AcceptInput(utt, features, ivector, online_ivectors, + online_ivector_period); + + HandleOutput(decoder_opts.determinize_lattice, word_syms, &decoder, + 
&compact_lattice_writer, &lattice_writer); + } + num_success = decoder.Finished(); + HandleOutput(decoder_opts.determinize_lattice, word_syms, &decoder, + &compact_lattice_writer, &lattice_writer); + + // At this point the decoder and batch-computer objects will print + // diagnostics from their destructors (they are going out of scope). + } + delete decode_fst; + delete word_syms; + +#if HAVE_CUDA==1 + CuDevice::Instantiate().PrintProfile(); +#endif + + return (num_success != 0 ? 0 : 1); + } catch(const std::exception &e) { + std::cerr << e.what(); + return -1; + } +} diff --git a/src/nnet3bin/nnet3-latgen-faster-parallel.cc b/src/nnet3bin/nnet3-latgen-faster-parallel.cc index 4858a9fcb14..e3d02410368 100644 --- a/src/nnet3bin/nnet3-latgen-faster-parallel.cc +++ b/src/nnet3bin/nnet3-latgen-faster-parallel.cc @@ -45,9 +45,11 @@ int main(int argc, char *argv[]) { using fst::StdArc; const char *usage = - "Generate lattices using nnet3 neural net model.\n" + "Generate lattices using nnet3 neural net model. This version supports\n" + "multiple decoding threads (using a shared decoding graph.)\n" "Usage: nnet3-latgen-faster-parallel [options] " - " [ [] ]\n"; + " [ [] ]\n" + "See also: nnet3-latgen-faster-batch (which supports GPUs)\n"; ParseOptions po(usage); Timer timer; diff --git a/src/nnet3bin/nnet3-latgen-faster.cc b/src/nnet3bin/nnet3-latgen-faster.cc index cb26745d808..42cd843cf15 100644 --- a/src/nnet3bin/nnet3-latgen-faster.cc +++ b/src/nnet3bin/nnet3-latgen-faster.cc @@ -33,7 +33,7 @@ int main(int argc, char *argv[]) { // note: making this program work with GPUs is as simple as initializing the // device, but it probably won't make a huge difference in speed for typical - // setups. + // setups. You should use nnet3-latgen-faster-batch if you want to use a GPU. 
try { using namespace kaldi; using namespace kaldi::nnet3; @@ -45,7 +45,8 @@ int main(int argc, char *argv[]) { const char *usage = "Generate lattices using nnet3 neural net model.\n" "Usage: nnet3-latgen-faster [options] " - " [ [] ]\n"; + " [ [] ]\n" + "See also: nnet3-latgen-faster-parallel, nnet3-latgen-faster-batch\n"; ParseOptions po(usage); Timer timer; bool allow_partial = false; diff --git a/src/nnetbin/Makefile b/src/nnetbin/Makefile index 49a174ec36e..86d59ae503e 100644 --- a/src/nnetbin/Makefile +++ b/src/nnetbin/Makefile @@ -25,7 +25,6 @@ TESTFILES = ADDLIBS = ../nnet/kaldi-nnet.a ../cudamatrix/kaldi-cudamatrix.a \ ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/online/Makefile b/src/online/Makefile index 8f2fe238111..32c99500750 100644 --- a/src/online/Makefile +++ b/src/online/Makefile @@ -37,8 +37,7 @@ LIBNAME = kaldi-online ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a \ ../feat/kaldi-feat.a ../transform/kaldi-transform.a \ ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/online2/Makefile b/src/online2/Makefile index 764fef3ab26..242c7be6da6 100644 --- a/src/online2/Makefile +++ b/src/online2/Makefile @@ -18,8 +18,8 @@ ADDLIBS = ../ivector/kaldi-ivector.a ../nnet3/kaldi-nnet3.a \ ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a diff --git 
a/src/online2/online-ivector-feature.cc b/src/online2/online-ivector-feature.cc index 4e64609d9ff..3356eb4b1c7 100644 --- a/src/online2/online-ivector-feature.cc +++ b/src/online2/online-ivector-feature.cc @@ -174,24 +174,54 @@ void OnlineIvectorFeature::UpdateFrameWeights( delta_weights_provided_ = true; } -void OnlineIvectorFeature::UpdateStatsForFrame(int32 t, - BaseFloat weight) { + +BaseFloat OnlineIvectorFeature::GetMinPost(BaseFloat weight) const { + BaseFloat min_post = info_.min_post; + BaseFloat abs_weight = fabs(weight); + // If we return 0.99, it will have the same effect as just picking the + // most probable Gaussian on that frame. + if (abs_weight == 0.0) + return 0.99; // I don't anticipate reaching here. + min_post /= abs_weight; + if (min_post > 0.99) + min_post = 0.99; + return min_post; +} + +void OnlineIvectorFeature::UpdateStatsForFrames( + const std::vector > &frame_weights) { + int32 num_frames = static_cast(frame_weights.size()); int32 feat_dim = lda_normalized_->Dim(); - Vector feat(feat_dim), // features given to iVector extractor - log_likes(info_.diag_ubm.NumGauss()); - lda_normalized_->GetFrame(t, &feat); - info_.diag_ubm.LogLikelihoods(feat, &log_likes); - // "posterior" stores the pruned posteriors for Gaussians in the UBM. - std::vector > posterior; - tot_ubm_loglike_ += weight * - VectorToPosteriorEntry(log_likes, info_.num_gselect, - info_.min_post, &posterior); - for (size_t i = 0; i < posterior.size(); i++) - posterior[i].second *= info_.posterior_scale * weight; - lda_->GetFrame(t, &feat); // get feature without CMN. 
- ivector_stats_.AccStats(info_.extractor, feat, posterior); + Matrix feats(num_frames, feat_dim, kUndefined), + log_likes; + + std::vector frames; + frames.reserve(frame_weights.size()); + for (int32 i = 0; i < num_frames; i++) + frames.push_back(frame_weights[i].first); + lda_normalized_->GetFrames(frames, &feats); + + info_.diag_ubm.LogLikelihoods(feats, &log_likes); + + // "posteriors" stores, for each frame index in the range of frames, the + // pruned posteriors for the Gaussians in the UBM. + std::vector > > posteriors(num_frames); + for (int32 i = 0; i < num_frames; i++) { + std::vector > &posterior = posteriors[i]; + BaseFloat weight = frame_weights[i].second; + if (weight != 0.0) { + tot_ubm_loglike_ += weight * + VectorToPosteriorEntry(log_likes.Row(i), info_.num_gselect, + GetMinPost(weight), &posterior); + for (size_t j = 0; j < posterior.size(); j++) + posterior[j].second *= info_.posterior_scale * weight; + } + } + lda_->GetFrames(frames, &feats); // get features without CMN. 
+ ivector_stats_.AccStats(info_.extractor, feats, posteriors); } + void OnlineIvectorFeature::UpdateStatsUntilFrame(int32 frame) { KALDI_ASSERT(frame >= 0 && frame < this->NumFramesReady() && !delta_weights_provided_); @@ -200,11 +230,19 @@ void OnlineIvectorFeature::UpdateStatsUntilFrame(int32 frame) { int32 ivector_period = info_.ivector_period; int32 num_cg_iters = info_.num_cg_iters; + std::vector > frame_weights; + for (; num_frames_stats_ <= frame; num_frames_stats_++) { int32 t = num_frames_stats_; - UpdateStatsForFrame(t, 1.0); + BaseFloat frame_weight = 1.0; + frame_weights.push_back(std::pair(t, frame_weight)); if ((!info_.use_most_recent_ivector && t % ivector_period == 0) || (info_.use_most_recent_ivector && t == frame)) { + // The call below to UpdateStatsForFrames() is equivalent to doing, for + // all valid indexes i: + // UpdateStatsForFrame(cur_start_frame + i, frame_weights[i]) + UpdateStatsForFrames(frame_weights); + frame_weights.clear(); ivector_stats_.GetIvector(num_cg_iters, ¤t_ivector_); if (!info_.use_most_recent_ivector) { // need to cache iVectors. int32 ivec_index = t / ivector_period; @@ -213,6 +251,8 @@ void OnlineIvectorFeature::UpdateStatsUntilFrame(int32 frame) { } } } + if (!frame_weights.empty()) + UpdateStatsForFrames(frame_weights); } void OnlineIvectorFeature::UpdateStatsUntilFrameWeighted(int32 frame) { @@ -225,17 +265,19 @@ void OnlineIvectorFeature::UpdateStatsUntilFrameWeighted(int32 frame) { int32 ivector_period = info_.ivector_period; int32 num_cg_iters = info_.num_cg_iters; + std::vector > frame_weights; + frame_weights.reserve(delta_weights_.size()); + for (; num_frames_stats_ <= frame; num_frames_stats_++) { int32 t = num_frames_stats_; // Instead of just updating frame t, we update all frames that need updating - // with index <= 1, in case old frames were reclassified as silence/nonsilence. + // with index <= t, in case old frames were reclassified as silence/nonsilence. 
while (!delta_weights_.empty() && delta_weights_.top().first <= t) { - std::pair p = delta_weights_.top(); + int32 frame = delta_weights_.top().first; + BaseFloat weight = delta_weights_.top().second; + frame_weights.push_back(delta_weights_.top()); delta_weights_.pop(); - int32 frame = p.first; - BaseFloat weight = p.second; - UpdateStatsForFrame(frame, weight); if (debug_weights) { if (current_frame_weight_debug_.size() <= frame) current_frame_weight_debug_.resize(frame + 1, 0.0); @@ -244,6 +286,8 @@ void OnlineIvectorFeature::UpdateStatsUntilFrameWeighted(int32 frame) { } if ((!info_.use_most_recent_ivector && t % ivector_period == 0) || (info_.use_most_recent_ivector && t == frame)) { + UpdateStatsForFrames(frame_weights); + frame_weights.clear(); ivector_stats_.GetIvector(num_cg_iters, ¤t_ivector_); if (!info_.use_most_recent_ivector) { // need to cache iVectors. int32 ivec_index = t / ivector_period; @@ -252,6 +296,8 @@ void OnlineIvectorFeature::UpdateStatsUntilFrameWeighted(int32 frame) { } } } + if (!frame_weights.empty()) + UpdateStatsForFrames(frame_weights); } @@ -297,7 +343,7 @@ void OnlineIvectorFeature::PrintDiagnostics() const { Vector temp_ivector(current_ivector_); temp_ivector(0) -= info_.extractor.PriorOffset(); - KALDI_VLOG(3) << "By the end of the utterance, objf change/frame " + KALDI_VLOG(2) << "By the end of the utterance, objf change/frame " << "from estimating iVector (vs. default) was " << ivector_stats_.ObjfChange(current_ivector_) << " and iVector length was " @@ -308,12 +354,8 @@ void OnlineIvectorFeature::PrintDiagnostics() const { OnlineIvectorFeature::~OnlineIvectorFeature() { PrintDiagnostics(); // Delete objects owned here. - delete lda_normalized_; - delete splice_normalized_; - delete cmvn_; - delete lda_; - delete splice_; - // base_ is not owned here so don't delete it. 
+ for (size_t i = 0; i < to_delete_.size(); i++) + delete to_delete_[i]; for (size_t i = 0; i < ivectors_history_.size(); i++) delete ivectors_history_[i]; } @@ -334,7 +376,8 @@ void OnlineIvectorFeature::GetAdaptationState( OnlineIvectorFeature::OnlineIvectorFeature( const OnlineIvectorExtractionInfo &info, OnlineFeatureInterface *base_feature): - info_(info), base_(base_feature), + info_(info), + base_(base_feature), ivector_stats_(info_.extractor.IvectorDim(), info_.extractor.PriorOffset(), info_.max_count), @@ -343,16 +386,33 @@ OnlineIvectorFeature::OnlineIvectorFeature( most_recent_frame_with_weight_(-1), tot_ubm_loglike_(0.0) { info.Check(); KALDI_ASSERT(base_feature != NULL); - splice_ = new OnlineSpliceFrames(info_.splice_opts, base_); - lda_ = new OnlineTransform(info.lda_mat, splice_); + OnlineFeatureInterface *splice_feature = new OnlineSpliceFrames(info_.splice_opts, base_feature); + to_delete_.push_back(splice_feature); + OnlineFeatureInterface *lda_feature = new OnlineTransform(info.lda_mat, splice_feature); + to_delete_.push_back(lda_feature); + OnlineFeatureInterface *lda_cache_feature = new OnlineCacheFeature(lda_feature); + lda_ = lda_cache_feature; + to_delete_.push_back(lda_cache_feature); + + OnlineCmvnState naive_cmvn_state(info.global_cmvn_stats); // Note: when you call this constructor the CMVN state knows nothing // about the speaker. If you want to inform this class about more specific // adaptation state, call this->SetAdaptationState(), most likely derived // from a call to GetAdaptationState() from a previous object of this type. 
- cmvn_ = new OnlineCmvn(info.cmvn_opts, naive_cmvn_state, base_); - splice_normalized_ = new OnlineSpliceFrames(info_.splice_opts, cmvn_); - lda_normalized_ = new OnlineTransform(info.lda_mat, splice_normalized_); + cmvn_ = new OnlineCmvn(info.cmvn_opts, naive_cmvn_state, base_feature); + to_delete_.push_back(cmvn_); + + OnlineFeatureInterface *splice_normalized = + new OnlineSpliceFrames(info_.splice_opts, cmvn_), + *lda_normalized = + new OnlineTransform(info.lda_mat, splice_normalized), + *cache_normalized = new OnlineCacheFeature(lda_normalized); + lda_normalized_ = cache_normalized; + + to_delete_.push_back(splice_normalized); + to_delete_.push_back(lda_normalized); + to_delete_.push_back(cache_normalized); // Set the iVector to its default value, [ prior_offset, 0, 0, ... ]. current_ivector_.Resize(info_.extractor.IvectorDim()); diff --git a/src/online2/online-ivector-feature.h b/src/online2/online-ivector-feature.h index d4a89fdc8d1..25e078f1a98 100644 --- a/src/online2/online-ivector-feature.h +++ b/src/online2/online-ivector-feature.h @@ -311,9 +311,19 @@ class OnlineIvectorFeature: public OnlineFeatureInterface { const std::vector > &delta_weights); private: - // this function adds "weight" to the stats for frame "frame". - void UpdateStatsForFrame(int32 frame, - BaseFloat weight); + + // This accumulates i-vector stats for a set of frames, specified as pairs + // (t, weight). The weights do not have to be positive. (In the online + // silence-weighting that we do, negative weights can occur if we change our + // minds about the assignment of a frame as silence vs. non-silence). + void UpdateStatsForFrames( + const std::vector > &frame_weights); + + // Returns a modified version of info_.min_post, which is opts_.min_post if + // weight is 1.0 or -1.0, but gets larger if fabs(weight) is small... but no + // larger than 0.99. (This is an efficiency thing, to not bother processing + // very small counts). 
+ BaseFloat GetMinPost(BaseFloat weight) const; // This is the original UpdateStatsUntilFrame that is called when there is // no data-weighting involved. @@ -327,14 +337,16 @@ class OnlineIvectorFeature: public OnlineFeatureInterface { const OnlineIvectorExtractionInfo &info_; - // base_ is the base feature; it is not owned here. - OnlineFeatureInterface *base_; - // the following online-feature-extractor pointers are owned here: - OnlineSpliceFrames *splice_; // splice on top of raw features. - OnlineTransform *lda_; // LDA on top of raw+splice features. - OnlineCmvn *cmvn_; - OnlineSpliceFrames *splice_normalized_; // splice on top of CMVN feats. - OnlineTransform *lda_normalized_; // LDA on top of CMVN+splice + OnlineFeatureInterface *base_; // The feature this is built on top of + // (e.g. MFCC); not owned here + + OnlineFeatureInterface *lda_; // LDA on top of raw+splice features. + OnlineCmvn *cmvn_; // the CMVN that we give to the lda_normalized_. + OnlineFeatureInterface *lda_normalized_; // LDA on top of CMVN+splice + + // the following is the pointers to OnlineFeatureInterface objects that are + // owned here and which we need to delete. 
+ std::vector to_delete_; /// the iVector estimation stats OnlineIvectorEstimationStats ivector_stats_; diff --git a/src/online2bin/Makefile b/src/online2bin/Makefile index 2731fbfae1d..8792cc5b11a 100644 --- a/src/online2bin/Makefile +++ b/src/online2bin/Makefile @@ -23,6 +23,5 @@ ADDLIBS = ../online2/kaldi-online2.a ../ivector/kaldi-ivector.a \ ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ ../feat/kaldi-feat.a ../transform/kaldi-transform.a \ ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/online2bin/ivector-extract-online2.cc b/src/online2bin/ivector-extract-online2.cc index 33aa990d1c3..e30d78620ad 100644 --- a/src/online2bin/ivector-extract-online2.cc +++ b/src/online2bin/ivector-extract-online2.cc @@ -23,6 +23,7 @@ #include "gmm/am-diag-gmm.h" #include "online2/online-ivector-feature.h" #include "util/kaldi-thread.h" +#include "base/timer.h" int main(int argc, char *argv[]) { using namespace kaldi; @@ -47,9 +48,9 @@ int main(int argc, char *argv[]) { "e.g.: \n" " ivector-extract-online2 --config=exp/nnet2_online/nnet_online/conf/ivector_extractor.conf \\\n" " ark:data/train/spk2utt scp:data/train/feats.scp ark,t:ivectors.1.ark\n"; - + ParseOptions po(usage); - + OnlineIvectorExtractionConfig ivector_config; ivector_config.Register(&po); @@ -57,7 +58,7 @@ int main(int argc, char *argv[]) { bool repeat = false; int32 length_tolerance = 0; std::string frame_weights_rspecifier; - + po.Register("num-threads", &g_num_threads, "Number of threads to use for computing derived variables " "of iVector extractor, at process start-up."); @@ -71,29 +72,28 @@ int main(int argc, char *argv[]) { "for feats and frame weights"); po.Read(argc, argv); - + if (po.NumArgs() != 3) { po.PrintUsage(); exit(1); } - + std::string spk2utt_rspecifier = po.GetArg(1), feature_rspecifier = po.GetArg(2), 
ivectors_wspecifier = po.GetArg(3); - + double tot_ubm_loglike = 0.0, tot_objf_impr = 0.0, tot_t = 0.0, tot_length = 0.0, tot_length_utt_end = 0.0; int32 num_done = 0, num_err = 0; - + ivector_config.use_most_recent_ivector = false; OnlineIvectorExtractionInfo ivector_info(ivector_config); - + SequentialTokenVectorReader spk2utt_reader(spk2utt_rspecifier); RandomAccessBaseFloatMatrixReader feature_reader(feature_rspecifier); RandomAccessBaseFloatVectorReader frame_weights_reader(frame_weights_rspecifier); BaseFloatMatrixWriter ivector_writer(ivectors_wspecifier); - - + for (; !spk2utt_reader.Done(); spk2utt_reader.Next()) { std::string spk = spk2utt_reader.Key(); const std::vector &uttlist = spk2utt_reader.Value(); @@ -107,12 +107,12 @@ int main(int argc, char *argv[]) { continue; } const Matrix &feats = feature_reader.Value(utt); - + OnlineMatrixFeature matrix_feature(feats); OnlineIvectorFeature ivector_feature(ivector_info, &matrix_feature); - + ivector_feature.SetAdaptationState(adaptation_state); if (!frame_weights_rspecifier.empty()) { @@ -143,10 +143,10 @@ int main(int argc, char *argv[]) { int32 T = feats.NumRows(), n = (repeat ? 
1 : ivector_config.ivector_period), num_ivectors = (T + n - 1) / n; - + Matrix ivectors(num_ivectors, ivector_feature.Dim()); - + for (int32 i = 0; i < num_ivectors; i++) { int32 t = i * n; SubVector ivector(ivectors, i); diff --git a/src/onlinebin/Makefile b/src/onlinebin/Makefile index 0999f4e7792..7c0550d0848 100644 --- a/src/onlinebin/Makefile +++ b/src/onlinebin/Makefile @@ -39,7 +39,7 @@ TESTFILES = ADDLIBS = ../online/kaldi-online.a ../decoder/kaldi-decoder.a \ ../lat/kaldi-lat.a ../hmm/kaldi-hmm.a ../feat/kaldi-feat.a \ ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/rnnlm/Makefile b/src/rnnlm/Makefile index 6ee52bbb1d7..d4b3f3ce0a8 100644 --- a/src/rnnlm/Makefile +++ b/src/rnnlm/Makefile @@ -15,7 +15,7 @@ OBJFILES = sampler.o rnnlm-example.o rnnlm-example-utils.o \ LIBNAME = kaldi-rnnlm ADDLIBS = ../nnet3/kaldi-nnet3.a ../cudamatrix/kaldi-cudamatrix.a \ - ../util/kaldi-util.a ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ - ../lm/kaldi-lm.a ../hmm/kaldi-hmm.a + ../lm/kaldi-lm.a ../hmm/kaldi-hmm.a ../util/kaldi-util.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/rnnlm/rnnlm-example-utils.cc b/src/rnnlm/rnnlm-example-utils.cc index fd7cca5eadb..5aa2465d24d 100644 --- a/src/rnnlm/rnnlm-example-utils.cc +++ b/src/rnnlm/rnnlm-example-utils.cc @@ -284,11 +284,15 @@ static void ProcessRnnlmOutputNoSampling( CuMatrix word_probs(nnet_output.NumRows(), num_words - 1, kUndefined); word_probs.CopyFromMat(word_logprobs.ColRange(1, num_words - 1)); - word_probs.ApplyExp(); + word_probs.ApplyExpLimited(-80.0, 80.0); CuVector row_sums(nnet_output.NumRows()); row_sums.AddColSumMat(1.0, word_probs, 0.0); row_sums.ApplyLog(); - *objf_den_exact = -VecVec(row_sums, 
minibatch.output_weights); + BaseFloat ans = -VecVec(row_sums, minibatch.output_weights); + *objf_den_exact = ans; + if (fabs(ans) > 100) { + KALDI_WARN << "Big den objf " << ans; + } } // In preparation for computing the denominator objf, change 'word_logprobs' diff --git a/src/rnnlm/rnnlm-example.cc b/src/rnnlm/rnnlm-example.cc index 0be4d4ecb47..8dd36689fd6 100644 --- a/src/rnnlm/rnnlm-example.cc +++ b/src/rnnlm/rnnlm-example.cc @@ -346,7 +346,7 @@ RnnlmExampleCreator::~RnnlmExampleCreator() { num_minibatches_written_; KALDI_LOG << "Combined " << num_sequences_processed_ << "/" << num_chunks_processed_ - << " chunks/sequences into " << num_minibatches_written_ + << " sequences/chunks into " << num_minibatches_written_ << " minibatches (" << chunks_.size() << " chunks left over)"; KALDI_LOG << "Overall there were " diff --git a/src/rnnlm/rnnlm-example.h b/src/rnnlm/rnnlm-example.h index 1f3bcb957a9..3ac92701e36 100644 --- a/src/rnnlm/rnnlm-example.h +++ b/src/rnnlm/rnnlm-example.h @@ -401,7 +401,7 @@ class RnnlmExampleCreator { TableWriter > *writer): config_(config), minibatch_sampler_(NULL), sampling_sequencer_(TaskSequencerConfig()), - writer_(writer), + writer_(writer), num_sequences_processed_(0), num_chunks_processed_(0), num_words_processed_(0), num_minibatches_written_(0) { Check(); } diff --git a/src/rnnlmbin/Makefile b/src/rnnlmbin/Makefile index 4c4231c02c8..23a8eba6145 100644 --- a/src/rnnlmbin/Makefile +++ b/src/rnnlmbin/Makefile @@ -16,11 +16,11 @@ cuda-compiled.o: ../kaldi.mk TESTFILES = -ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../lm/kaldi-lm.a ../nnet3/kaldi-nnet3.a \ +ADDLIBS = ../rnnlm/kaldi-rnnlm.a ../nnet3/kaldi-nnet3.a \ ../cudamatrix/kaldi-cudamatrix.a ../decoder/kaldi-decoder.a \ - ../lat/kaldi-lat.a ../fstext/kaldi-fstext.a ../hmm/kaldi-hmm.a \ - ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../lat/kaldi-lat.a ../lm/kaldi-lm.a 
../fstext/kaldi-fstext.a \ + ../hmm/kaldi-hmm.a ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/sgmm2/Makefile b/src/sgmm2/Makefile index d538c14c1a9..35a8d3a1f40 100644 --- a/src/sgmm2/Makefile +++ b/src/sgmm2/Makefile @@ -13,7 +13,7 @@ OBJFILES = am-sgmm2.o estimate-am-sgmm2.o estimate-am-sgmm2-ebw.o fmllr-sgmm2.o LIBNAME = kaldi-sgmm2 ADDLIBS = ../transform/kaldi-transform.a ../gmm/kaldi-gmm.a \ - ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a + ../tree/kaldi-tree.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/sgmm2/decodable-am-sgmm2.h b/src/sgmm2/decodable-am-sgmm2.h index 75144650568..18498bf5b24 100644 --- a/src/sgmm2/decodable-am-sgmm2.h +++ b/src/sgmm2/decodable-am-sgmm2.h @@ -59,15 +59,15 @@ class DecodableAmSgmm2 : public DecodableInterface { sgmm_cache_(sgmm.NumGroups(), sgmm.NumPdfs()), delete_vars_(true) { KALDI_ASSERT(gselect->size() == static_cast(feats->NumRows())); } - + // Note, frames are numbered from zero, but transition indices are 1-based! // This is for compatibility with OpenFST. 
virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { - return LogLikelihoodForPdf(frame, trans_model_.TransitionIdToPdf(tid)); + return LogLikelihoodForPdf(frame, trans_model_.TransitionIdToPdfFast(tid)); } int32 NumFramesReady() const { return feature_matrix_->NumRows(); } virtual int32 NumIndices() const { return trans_model_.NumTransitionIds(); } - + virtual bool IsLastFrame(int32 frame) const { KALDI_ASSERT(frame < NumFramesReady()); return (frame == NumFramesReady() - 1); @@ -81,17 +81,17 @@ class DecodableAmSgmm2 : public DecodableInterface { Sgmm2PerSpkDerivedVars *spk_; const TransitionModel &trans_model_; ///< for tid to pdf mapping const Matrix *feature_matrix_; - const std::vector > *gselect_; - + const std::vector > *gselect_; + BaseFloat log_prune_; - + int32 cur_frame_; Sgmm2PerFrameDerivedVars per_frame_vars_; Sgmm2LikelihoodCache sgmm_cache_; bool delete_vars_; // If true, we will delete feature_matrix_, gselect_, and // spk_ in the destructor. - + private: KALDI_DISALLOW_COPY_AND_ASSIGN(DecodableAmSgmm2); }; @@ -121,10 +121,10 @@ class DecodableAmSgmm2Scaled : public DecodableAmSgmm2 { : DecodableAmSgmm2(sgmm, tm, feats, gselect, spk, log_prune), scale_(scale) {} - + // Note, frames are numbered from zero but transition-ids from one. 
virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { - return LogLikelihoodForPdf(frame, trans_model_.TransitionIdToPdf(tid)) + return LogLikelihoodForPdf(frame, trans_model_.TransitionIdToPdfFast(tid)) * scale_; } private: diff --git a/src/sgmm2bin/Makefile b/src/sgmm2bin/Makefile index 34407a4f5ad..e973061ed8a 100644 --- a/src/sgmm2bin/Makefile +++ b/src/sgmm2bin/Makefile @@ -21,7 +21,6 @@ ADDLIBS = ../decoder/kaldi-decoder.a ../lat/kaldi-lat.a \ ../fstext/kaldi-fstext.a ../sgmm2/kaldi-sgmm2.a ../hmm/kaldi-hmm.a \ ../feat/kaldi-feat.a ../transform/kaldi-transform.a \ ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/tfrnnlm/Makefile b/src/tfrnnlm/Makefile index 12e6c9494c9..db2b840b959 100644 --- a/src/tfrnnlm/Makefile +++ b/src/tfrnnlm/Makefile @@ -28,9 +28,8 @@ TESTFILES = LIBNAME = kaldi-tensorflow-rnnlm -ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ - +ADDLIBS = ../lm/kaldi-lm.a ../util/kaldi-util.a ../matrix/kaldi-matrix.a \ + ../base/kaldi-base.a LDLIBS += -lz -ldl -fPIC -lrt LDLIBS += -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow_cc -ltensorflow_framework diff --git a/src/tfrnnlmbin/Makefile b/src/tfrnnlmbin/Makefile index f2a353c918c..4beeeb0d594 100644 --- a/src/tfrnnlmbin/Makefile +++ b/src/tfrnnlmbin/Makefile @@ -29,8 +29,8 @@ TESTFILES = ADDLIBS = ../lat/kaldi-lat.a ../lm/kaldi-lm.a ../fstext/kaldi-fstext.a \ ../hmm/kaldi-hmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a \ - ../base/kaldi-base.a ../tfrnnlm/kaldi-tensorflow-rnnlm.a + ../matrix/kaldi-matrix.a ../base/kaldi-base.a \ + ../tfrnnlm/kaldi-tensorflow-rnnlm.a LDLIBS += -lz -ldl -fPIC -lrt LDLIBS += -L$(TENSORFLOW)/bazel-bin/tensorflow -ltensorflow_cc -ltensorflow_framework diff --git a/src/transform/Makefile b/src/transform/Makefile index 
02f5d0ec396..a265db6ac37 100644 --- a/src/transform/Makefile +++ b/src/transform/Makefile @@ -14,8 +14,7 @@ OBJFILES = regression-tree.o regtree-mllr-diag-gmm.o lda-estimate.o \ LIBNAME = kaldi-transform -ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a \ - ../util/kaldi-util.a \ - ../matrix/kaldi-matrix.a ../base/kaldi-base.a +ADDLIBS = ../gmm/kaldi-gmm.a ../tree/kaldi-tree.a ../util/kaldi-util.a \ + ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/src/transform/cmvn.cc b/src/transform/cmvn.cc index 8dfe016227a..76f6652eecd 100644 --- a/src/transform/cmvn.cc +++ b/src/transform/cmvn.cc @@ -74,41 +74,43 @@ void ApplyCmvn(const MatrixBase &stats, if (stats.NumRows() == 1 && var_norm) KALDI_ERR << "You requested variance normalization but no variance stats " << "are supplied."; - + double count = stats(0, dim); // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when // computing an offset and representing it as stats, we use a count of one. if (count < 1.0) KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: " << "count = " << count; - - Matrix norm(2, dim); // norm(0, d) = mean offset + + if (!var_norm) { + Vector offset(dim); + SubVector mean_stats(stats.RowData(0), dim); + offset.AddVec(-1.0 / count, mean_stats); + feats->AddVecToRows(1.0, offset); + return; + } + // norm(0, d) = mean offset; // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). 
+ Matrix norm(2, dim); for (int32 d = 0; d < dim; d++) { double mean, offset, scale; mean = stats(0, d)/count; - if (!var_norm) { - scale = 1.0; - offset = -mean; - } else { - double var = (stats(1, d)/count) - mean*mean, - floor = 1.0e-20; - if (var < floor) { - KALDI_WARN << "Flooring cepstral variance from " << var << " to " - << floor; - var = floor; - } - scale = 1.0 / sqrt(var); - if (scale != scale || 1/scale == 0.0) - KALDI_ERR << "NaN or infinity in cepstral mean/variance computation"; - offset = -(mean*scale); + double var = (stats(1, d)/count) - mean*mean, + floor = 1.0e-20; + if (var < floor) { + KALDI_WARN << "Flooring cepstral variance from " << var << " to " + << floor; + var = floor; } + scale = 1.0 / sqrt(var); + if (scale != scale || 1/scale == 0.0) + KALDI_ERR << "NaN or infinity in cepstral mean/variance computation"; + offset = -(mean*scale); norm(0, d) = offset; norm(1, d) = scale; } // Apply the normalization. - if (var_norm) - feats->MulColsVec(norm.Row(1)); + feats->MulColsVec(norm.Row(1)); feats->AddVecToRows(1.0, norm.Row(0)); } @@ -125,14 +127,14 @@ void ApplyCmvnReverse(const MatrixBase &stats, if (stats.NumRows() == 1 && var_norm) KALDI_ERR << "You requested variance normalization but no variance stats " << "are supplied."; - + double count = stats(0, dim); // Do not change the threshold of 1.0 here: in the balanced-cmvn code, when // computing an offset and representing it as stats, we use a count of one. if (count < 1.0) KALDI_ERR << "Insufficient stats for cepstral mean and variance normalization: " << "count = " << count; - + Matrix norm(2, dim); // norm(0, d) = mean offset // norm(1, d) = scale, e.g. x(d) <-- x(d)*norm(1, d) + norm(0, d). 
for (int32 d = 0; d < dim; d++) { diff --git a/src/transform/decodable-am-diag-gmm-regtree.h b/src/transform/decodable-am-diag-gmm-regtree.h index 9da4b7f1591..b6e7888ffdc 100644 --- a/src/transform/decodable-am-diag-gmm-regtree.h +++ b/src/transform/decodable-am-diag-gmm-regtree.h @@ -51,7 +51,7 @@ class DecodableAmDiagGmmRegtreeFmllr: public DecodableAmDiagGmmUnmapped { // Note, frames are numbered from zero but transition-ids (tid) from one. virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { return scale_*LogLikelihoodZeroBased(frame, - trans_model_.TransitionIdToPdf(tid)); + trans_model_.TransitionIdToPdfFast(tid)); } virtual int32 NumFramesReady() const { return feature_matrix_.NumRows(); } @@ -94,7 +94,7 @@ class DecodableAmDiagGmmRegtreeMllr: public DecodableAmDiagGmmUnmapped { // Note, frames are numbered from zero but transition-ids (tid) from one. virtual BaseFloat LogLikelihood(int32 frame, int32 tid) { return scale_*LogLikelihoodZeroBased(frame, - trans_model_.TransitionIdToPdf(tid)); + trans_model_.TransitionIdToPdfFast(tid)); } virtual int32 NumFramesReady() const { return feature_matrix_.NumRows(); } diff --git a/src/util/Makefile b/src/util/Makefile index 80c57fd7435..acfab8b8de1 100644 --- a/src/util/Makefile +++ b/src/util/Makefile @@ -15,6 +15,6 @@ OBJFILES = text-utils.o kaldi-io.o kaldi-holder.o kaldi-table.o \ LIBNAME = kaldi-util -ADDLIBS = ../matrix/kaldi-matrix.a ../base/kaldi-base.a +ADDLIBS = ../matrix/kaldi-matrix.a ../base/kaldi-base.a include ../makefiles/default_rules.mk diff --git a/tools/extras/install_irstlm.sh b/tools/extras/install_irstlm.sh index b27f0f89897..7c88377ad87 100755 --- a/tools/extras/install_irstlm.sh +++ b/tools/extras/install_irstlm.sh @@ -11,6 +11,13 @@ errcho() { echo "$@" 1>&2; } errcho "****() Installing IRSTLM" +if [ ! -d ./extras ]; then + errcho "****** You are trying to install IRSTLM from the wrong directory. You should" + errcho "****** go to tools/ and type extras/install_irstlm.sh." 
+ exit 1 +fi + + if [ ! -d ./irstlm ] ; then svn=`which git` if [ $? != 0 ] ; then