diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
index c7e50ea165e..7a16bdcdb12 100755
--- a/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
+++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances.sh
@@ -174,10 +174,17 @@ if [ $stage -le 3 ]; then
   cp $srcdir/phones.txt $dir 2>/dev/null || true
 
   mkdir -p $graph_dir
+  
+  n_reco=$(cat $text | wc -l) || exit 1
+  nj_reco=$nj
+  # Never request more jobs than there are lines in $text.
+  if [ $nj -gt $n_reco ]; then
+    nj_reco=$n_reco
+  fi
 
   # Make graphs w.r.t. to the original text (usually recording-level)
   steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
-    --nj $nj --cmd "$cmd" $text \
+    --nj $nj_reco --cmd "$cmd" $text \
     $lang $dir $dir/graphs
   if [ -z "$utt2text" ]; then
     # and then copy it to the sub-segments.
diff --git a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
index 751200bdf83..f0df1e7730c 100755
--- a/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
+++ b/egs/wsj/s5/steps/cleanup/segment_long_utterances_nnet3.sh
@@ -235,10 +235,17 @@ if [ $stage -le 3 ]; then
 
   mkdir -p $graph_dir
 
+  n_reco=$(cat $text | wc -l) || exit 1
+  nj_reco=$nj
+  # Never request more jobs than there are lines in $text.
+  if [ $nj -gt $n_reco ]; then
+    nj_reco=$n_reco
+  fi
+
   # Make graphs w.r.t. to the original text (usually recording-level)
   steps/cleanup/make_biased_lm_graphs.sh $graph_opts \
     --scale-opts "$scale_opts" \
-    --nj $nj --cmd "$cmd" $text \
+    --nj $nj_reco --cmd "$cmd" $text \
     $lang $dir $dir/graphs
   if [ -z "$utt2text" ]; then
     # and then copy it to the sub-segments.
diff --git a/egs/wsj/s5/utils/parse_options.sh b/egs/wsj/s5/utils/parse_options.sh
index 34476fdb37a..335e69e9ac7 100755
--- a/egs/wsj/s5/utils/parse_options.sh
+++ b/egs/wsj/s5/utils/parse_options.sh
@@ -42,7 +42,7 @@ done
 
 
 ###
-### No we process the command line options
+### Now we process the command line options
 ###
 while true; do
   [ -z "${1:-}" ] && break;  # break if there are no arguments
diff --git a/src/cudamatrix/cu-kernels.cu b/src/cudamatrix/cu-kernels.cu
index 17d56a05772..515412ca398 100644
--- a/src/cudamatrix/cu-kernels.cu
+++ b/src/cudamatrix/cu-kernels.cu
@@ -2552,9 +2552,12 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
   const int i = blockIdx.x;
   const int tid = threadIdx.x;
   const Real* x_row = x + i * x_d.stride;
+
   typedef cub::BlockReduce<Real, CU1DBLOCK> BlockReduceT;
   __shared__ typename BlockReduceT::TempStorage temp_storage;
-  __shared__ Real ssum[CU1DBLOCK];
+
+  __shared__ Real stddev_div_target_rms;
+  __shared__ Real scale;
 
   // Reduce x_j^2 to CU1DBLOCK elements per row
   Real tsum = Real(0);
@@ -2563,14 +2566,14 @@ static void _normalize_per_row(Real *y, int y_stride, const Real *x,
   }
   tsum = BlockReduceT(temp_storage).Sum(tsum);
   __syncthreads();
-  
 
-  const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
-  ssum[tid] = sqrt(
-    fmax(tsum / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
-
-  const Real stddev_div_target_rms = ssum[0];
-  const Real scale = Real(1) / stddev_div_target_rms;
+  if (tid == 0) {
+    const Real kSquaredNormFloor = 1.3552527156068805425e-20; // 2^-66
+    stddev_div_target_rms = sqrt(
+      fmax(tsum / (target_rms * target_rms * x_d.cols), kSquaredNormFloor));
+    scale = Real(1) / stddev_div_target_rms;
+  }
+  __syncthreads();
 
   // Store normalized input to output
   Real* y_row = y + i * y_stride;
diff --git a/src/cudamatrix/cu-math-test.cc b/src/cudamatrix/cu-math-test.cc
index 09255c9587b..022742ed29f 100644
--- a/src/cudamatrix/cu-math-test.cc
+++ b/src/cudamatrix/cu-math-test.cc
@@ -545,6 +545,50 @@ static void UnitTestCuMathNormalizePerRow() {
   }
 }
 
+
+template<typename Real>
+static void UnitTestCuMathNormalizePerRow_v2() {
+  // Checks cu::NormalizePerRow on a 128 x 1024 matrix against a CPU reference.
+  int row = 128;
+  int col = 1024;
+
+  Matrix<Real> Hi(row,col);
+  Matrix<Real> Ho(row,col);
+  Hi.SetRandn();
+  Hi.Scale(5.0);
+  Hi.ApplyFloor(0.0); // like ReLU,
+
+  CuMatrix<Real> Di(row, col);
+  CuMatrix<Real> Do(row, col);
+  Di.CopyFromMat(Hi);
+
+  Real target_rms = 0.3456;
+  bool add_log_stddev = false;
+  const Real kSquaredNormFloor = 1.35525271560688e-20; // 2^-66
+
+  //gpu
+  cu::NormalizePerRow(Di, target_rms, add_log_stddev, &Do);
+
+  //cpu
+  {
+    MatrixBase<Real>& in(Hi);
+    MatrixBase<Real>& out(Ho);
+    // Reuse the target_rms given to the GPU call so both paths always agree.
+    Vector<Real> in_norm(in.NumRows());
+    Real d_scaled = in.NumCols() * target_rms * target_rms;
+    in_norm.AddDiagMat2(1.0 / d_scaled, in, kNoTrans, 0.0);
+    in_norm.ApplyFloor(kSquaredNormFloor);
+    in_norm.ApplyPow(-0.5);
+    out.CopyFromMat(in);
+    out.MulRowsVec(in_norm);
+  }
+
+  Matrix<Real> Ho2(Do);
+  // here the BUG was detected (by processing big-enough matrix),
+  AssertEqual(Ho,Ho2,0.00001);
+}
+
+
 template<typename Real>
 static void UnitTestCuDiffNormalizePerRow() {
   for (int32 i = 0; i < 2; i++) {
@@ -660,6 +704,7 @@ template<typename Real> void CudaMathUnitTest() {
   UnitTestEnsureNonzero<Real>();
   UnitTestBackpropLstmNonlinearity<Real>();
   UnitTestCuMathNormalizePerRow<Real>();
+  UnitTestCuMathNormalizePerRow_v2<Real>();
   UnitTestCuDiffNormalizePerRow<Real>();
 }
 
@@ -673,9 +718,9 @@ int main() {
   for (; loop < 2; loop++) {
     CuDevice::Instantiate().SetDebugStrideMode(true);
     if (loop == 0)
-      CuDevice::Instantiate().SelectGpuId("no"); // -1 means no GPU
+      CuDevice::Instantiate().SelectGpuId("no"); // 0 means no GPU
     else
-      CuDevice::Instantiate().SelectGpuId("yes"); // -2 .. automatic selection
+      CuDevice::Instantiate().SelectGpuId("yes"); // 1 .. automatic selection
 #endif
     srand(time(NULL));
     kaldi::CudaMathUnitTest<float>();
diff --git a/src/decoder/grammar-fst.cc b/src/decoder/grammar-fst.cc
index ab1a8142c1d..1b79e7b5521 100644
--- a/src/decoder/grammar-fst.cc
+++ b/src/decoder/grammar-fst.cc
@@ -706,7 +706,7 @@ bool GrammarFstPreparer::IsEntryState(StateId s) const {
     // we check that at least one has label with nonterminal equal to #nonterm_begin...
     // in fact they will all have this value if at least one does, and this was checked
     // in NeedEpsilons().
-    if (nonterminal == kNontermBegin)
+    if (nonterminal == GetPhoneSymbolFor(kNontermBegin))
       return true;
   }
   return false;
diff --git a/src/doc/online_decoding.dox b/src/doc/online_decoding.dox
index 799bfb5895f..dc04d9bef4e 100644
--- a/src/doc/online_decoding.dox
+++ b/src/doc/online_decoding.dox
@@ -438,6 +438,70 @@ and downloadable models that can be used with online nnet3 decoding, please
 see http://kaldi-asr.org/models.html (the first model there, the ASPIRE model,
 includes instructions in a README file).
 
+\subsection online_decoding_nnet3_tcp TCP server for nnet3 online decoding
+
+The program to run the TCP server is online2-tcp-nnet3-decode-faster located in the
+~/src/online2bin folder. The usage is as follows:
+
+\verbatim
+online2-tcp-nnet3-decode-faster <nnet3-in> <fst-in> <word-symbol-table> <listen-port>
+\endverbatim
+
+For example:
+
+\verbatim
+online2-tcp-nnet3-decode-faster model/final.mdl graph/HCLG.fst graph/words.txt 5050
+\endverbatim
+
+The word symbol table is mandatory (unlike other nnet3 online decoding programs) because
+the server outputs word strings. Endpointing is mandatory to make the operation of the
+program reasonable. Other, non-standard options include:
+    - samp-freq - sampling frequency of audio (usually 8000 for telephony and 16000 for other uses)
+    - chunk-length - length of signal being processed by decoder at each step
+    - output-period - how often we check for changes in the decoding (i.e. output refresh rate, default 1s)
+    - num-threads-startup - number of threads used when initializing iVector extractor
+
+The TCP protocol simply takes RAW signal on input (16-bit signed integer
+encoding at chosen sampling frequency) and outputs simple text using the following
+logic:
+    - each refresh period (output-period option) the current state of decoding is output
+    - each line is terminated by '\r'
+    - once an utterance boundary is detected due to endpointing a '\n' char is output
+
+Each output string (delimited by '\r') should be treated as uncertain and can change
+entirely until the utterance delimiter ('\n') is sent. The delimiter chars are chosen
+specifically in order to make the output look neat in the terminal. It is possible to
+use it with other interfaces and a web demo (HTML/JS AudioAPI+WebSockets) exists.
+
+To run the program from the terminal you can use one of the following commands. First,
+make sure the server is running and accepting connections. Using the Aspire models, the
+command should look like this:
+\verbatim
+online2-tcp-nnet3-decode-faster --samp-freq=8000 --frames-per-chunk=20 --extra-left-context-initial=0
+    --frame-subsampling-factor=3 --config=model/conf/online.conf --min-active=200 --max-active=7000
+    --beam=15.0 --lattice-beam=6.0 --acoustic-scale=1.0 model/final.mdl graph/HCLG.fst graph/words.txt 5050
+\endverbatim
+
+To send a WAV file into the server, it first needs to be decoded into raw audio, then it can be
+sent to the socket:
+\verbatim
+sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | nc -N localhost 5050
+\endverbatim
+
+It is possible to play audio (almost) simultaneously as decoding. It may require installing the
+'pv' program (used to throttle the signal into Kaldi at the same speed as the playback):
+
+\verbatim
+sox audio.wav -t raw -c 1 -b 16 -r 8k -e signed-integer - | \
+    tee >(play -t raw -r 8k -e signed-integer -b 16 -c 1 -q -) | \
+    pv -L 16000 -q | nc -N localhost 5050
+\endverbatim
+
+Finally, it is possible to send audio from the microphone directly into the server:
+
+\verbatim
+rec -r 8k -e signed-integer -c 1 -b 16 -t raw -q - | nc -N localhost 5050
+\endverbatim
 
 
 */
diff --git a/src/feat/online-feature.cc b/src/feat/online-feature.cc
index 138dabe2236..3e8bf483694 100644
--- a/src/feat/online-feature.cc
+++ b/src/feat/online-feature.cc
@@ -24,7 +24,7 @@
 
 namespace kaldi {
 
-RecyclingVector::RecyclingVector(int items_to_hold) :
+RecyclingVector::RecyclingVector(int items_to_hold):
   items_to_hold_(items_to_hold == 0 ? -1 : items_to_hold),
   first_available_index_(0) {
 }
@@ -38,7 +38,8 @@ RecyclingVector::~RecyclingVector() {
 Vector<BaseFloat> *RecyclingVector::At(int index) const {
   if (index < first_available_index_) {
     KALDI_ERR << "Attempted to retrieve feature vector that was "
-                 "already removed by the RecyclingVector (index = " << index << "; "
+                 "already removed by the RecyclingVector (index = "
+              << index << "; "
               << "first_available_index = " << first_available_index_ << "; "
               << "size = " << Size() << ")";
   }
@@ -59,14 +60,13 @@ int RecyclingVector::Size() const {
   return first_available_index_ + items_.size();
 }
 
-
-template<class C>
+template <class C>
 void OnlineGenericBaseFeature<C>::GetFrame(int32 frame,
                                            VectorBase<BaseFloat> *feat) {
   feat->CopyFromVec(*(features_.At(frame)));
 };
 
-template<class C>
+template <class C>
 OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
     const typename C::Options &opts):
     computer_(opts),
@@ -77,29 +77,80 @@ OnlineGenericBaseFeature<C>::OnlineGenericBaseFeature(
                             &window_function_);
 }
 
-template<class C>
-void OnlineGenericBaseFeature<C>::AcceptWaveform(BaseFloat sampling_rate,
-                                                 const VectorBase<BaseFloat> &waveform) {
+
+template <class C>
+void OnlineGenericBaseFeature<C>::MaybeCreateResampler(
+    BaseFloat sampling_rate) {
   BaseFloat expected_sampling_rate = computer_.GetFrameOptions().samp_freq;
-  if (sampling_rate != expected_sampling_rate)
+  // Resampling is opt-in: it requires allow_downsample / allow_upsample.
+  if (resampler_ != nullptr) {
+    KALDI_ASSERT(resampler_->GetInputSamplingRate() == sampling_rate);
+    KALDI_ASSERT(resampler_->GetOutputSamplingRate() == expected_sampling_rate);
+  } else if (((sampling_rate > expected_sampling_rate) &&
+              computer_.GetFrameOptions().allow_downsample) ||
+             ((sampling_rate < expected_sampling_rate) &&
+              computer_.GetFrameOptions().allow_upsample)) {
+    resampler_.reset(new LinearResample(
+        sampling_rate, expected_sampling_rate,
+        std::min(sampling_rate / 2, expected_sampling_rate / 2), 6));
+  } else if (sampling_rate != expected_sampling_rate) {
     KALDI_ERR << "Sampling frequency mismatch, expected "
-              << expected_sampling_rate << ", got " << sampling_rate;
-  if (waveform.Dim() == 0)
+              << expected_sampling_rate << ", got " << sampling_rate
+              << "\nPerhaps you want to use the options "
+                 "--allow_{upsample,downsample}";
+  }
+}
+
+template <class C>
+void OnlineGenericBaseFeature<C>::InputFinished() {
+  if (resampler_ != nullptr) {
+    // Flush the resampler and append its remaining output to the remainder.
+    Vector<BaseFloat> appended_wave, resampled_wave;
+    resampler_->Resample(appended_wave, true, &resampled_wave);
+    appended_wave.Resize(waveform_remainder_.Dim() + resampled_wave.Dim());
+    if (waveform_remainder_.Dim() != 0)
+      appended_wave.Range(0, waveform_remainder_.Dim())
+          .CopyFromVec(waveform_remainder_);
+    appended_wave.Range(waveform_remainder_.Dim(), resampled_wave.Dim())
+        .CopyFromVec(resampled_wave);
+    waveform_remainder_.Swap(&appended_wave);
+  }
+  input_finished_ = true;
+  ComputeFeatures();
+}
+
+template <class C>
+void OnlineGenericBaseFeature<C>::AcceptWaveform(
+    BaseFloat sampling_rate, const VectorBase<BaseFloat> &original_waveform) {
+  if (original_waveform.Dim() == 0)
     return;  // Nothing to do.
   if (input_finished_)
     KALDI_ERR << "AcceptWaveform called after InputFinished() was called.";
-  // append 'waveform' to 'waveform_remainder_.'
-  Vector<BaseFloat> appended_wave(waveform_remainder_.Dim() + waveform.Dim());
+
+  Vector<BaseFloat> appended_wave;
+  Vector<BaseFloat> resampled_wave;
+  // Points at the samples to append: the input itself or its resampled copy.
+  const VectorBase<BaseFloat> *waveform;
+
+  MaybeCreateResampler(sampling_rate);
+  if (resampler_ == nullptr) {
+    waveform = &original_waveform;
+  } else {
+    resampler_->Resample(original_waveform, false, &resampled_wave);
+    waveform = &resampled_wave;
+  }
+  // append 'waveform' to 'waveform_remainder_.'
+  appended_wave.Resize(waveform_remainder_.Dim() + waveform->Dim());
   if (waveform_remainder_.Dim() != 0)
-    appended_wave.Range(0, waveform_remainder_.Dim()).CopyFromVec(
-        waveform_remainder_);
-  appended_wave.Range(waveform_remainder_.Dim(), waveform.Dim()).CopyFromVec(
-      waveform);
+    appended_wave.Range(0, waveform_remainder_.Dim())
+        .CopyFromVec(waveform_remainder_);
+  appended_wave.Range(waveform_remainder_.Dim(), waveform->Dim())
+      .CopyFromVec(*waveform);
   waveform_remainder_.Swap(&appended_wave);
   ComputeFeatures();
 }
 
-template<class C>
+template <class C>
 void OnlineGenericBaseFeature<C>::ComputeFeatures() {
   const FrameExtractionOptions &frame_opts = computer_.GetFrameOptions();
   int64 num_samples_total = waveform_offset_ + waveform_remainder_.Dim();
@@ -145,7 +196,6 @@ void OnlineGenericBaseFeature<C>::ComputeFeatures() {
 template class OnlineGenericBaseFeature<MfccComputer>;
 template class OnlineGenericBaseFeature<FbankComputer>;
 
-
 OnlineCmvnState::OnlineCmvnState(const OnlineCmvnState &other):
     speaker_cmvn_stats(other.speaker_cmvn_stats),
     global_cmvn_stats(other.global_cmvn_stats),
@@ -173,8 +223,6 @@ void OnlineCmvnState::Read(std::istream &is, bool binary) {
   ExpectToken(is, binary, "</OnlineCmvnState>");
 }
 
-
-
 OnlineCmvn::OnlineCmvn(const OnlineCmvnOptions &opts,
                        const OnlineCmvnState &cmvn_state,
                        OnlineFeatureInterface *src):
@@ -328,7 +376,8 @@ void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
   // If count exceeded cmn_window it would be an error in how "window_stats"
   // was accumulated.
   KALDI_ASSERT(cur_count <= 1.001 * opts.cmn_window);
-  if (cur_count >= opts.cmn_window) return;
+  if (cur_count >= opts.cmn_window)
+    return;
   if (speaker_stats.NumRows() != 0) {  // if we have speaker stats..
     double count_from_speaker = opts.cmn_window - cur_count,
         speaker_count = speaker_stats(0, dim);
@@ -341,7 +390,8 @@ void OnlineCmvn::SmoothOnlineCmvnStats(const MatrixBase<double> &speaker_stats,
                              speaker_stats);
     cur_count = (*stats)(0, dim);
   }
-  if (cur_count >= opts.cmn_window) return;
+  if (cur_count >= opts.cmn_window)
+    return;
   if (global_stats.NumRows() != 0) {
     double count_from_global = opts.cmn_window - cur_count,
         global_count = global_stats(0, dim);
@@ -433,7 +483,7 @@ void OnlineCmvn::SetState(const OnlineCmvnState &cmvn_state) {
 
 int32 OnlineSpliceFrames::NumFramesReady() const {
   int32 num_frames = src_->NumFramesReady();
-  if (num_frames > 0 && src_->IsLastFrame(num_frames-1))
+  if (num_frames > 0 && src_->IsLastFrame(num_frames - 1))
     return num_frames;
   else
     return std::max<int32>(0, num_frames - right_context_);
diff --git a/src/feat/online-feature.h b/src/feat/online-feature.h
index 0ddc2601dec..2978d02f090 100644
--- a/src/feat/online-feature.h
+++ b/src/feat/online-feature.h
@@ -112,10 +112,7 @@ class OnlineGenericBaseFeature: public OnlineBaseFeature {
   // more waveform.  This will help flush out the last frame or two
   // of features, in the case where snip-edges == false; it also
   // affects the return value of IsLastFrame().
-  virtual void InputFinished() {
-    input_finished_ = true;
-    ComputeFeatures();
-  }
+  virtual void InputFinished();
 
  private:
   // This function computes any additional feature frames that it is possible to
diff --git a/src/feat/resample.h b/src/feat/resample.h
index ecac2ba7566..e0b4688c99b 100644
--- a/src/feat/resample.h
+++ b/src/feat/resample.h
@@ -185,6 +185,10 @@ class LinearResample {
   /// Resample(x, y, true) for the last piece.  Call it unnecessarily between
   /// signals will not do any harm.
   void Reset();
+
+  /// Return the input and output sampling rates (for checks, for example)
+  inline int32 GetInputSamplingRate() const { return samp_rate_in_; }
+  inline int32 GetOutputSamplingRate() const { return samp_rate_out_; }
  private:
   /// This function outputs the number of output samples we will output
   /// for a signal with "input_num_samp" input samples.  If flush == true,
diff --git a/src/fstext/context-fst.cc b/src/fstext/context-fst.cc
index 9936a398e37..1e41adc021f 100644
--- a/src/fstext/context-fst.cc
+++ b/src/fstext/context-fst.cc
@@ -345,7 +345,7 @@ SymbolTable *CreateILabelInfoSymbolTable(const vector<vector<int32> > &info,
                                          const SymbolTable &phones_symtab,
                                          std::string separator,
                                          std::string initial_disambig) {  // e.g. separator = "/", initial-disambig="#-1"
-  KALDI_ASSERT(!info.empty() && !info[0].empty());
+  KALDI_ASSERT(!info.empty() && info[0].empty());
   SymbolTable *ans = new SymbolTable("ilabel-info-symtab");
   int64 s = ans->AddSymbol(phones_symtab.Find(static_cast<int64>(0)));
   assert(s == 0);
diff --git a/src/nnet3/decodable-online-looped.cc b/src/nnet3/decodable-online-looped.cc
index 2159575df6c..751438606e8 100644
--- a/src/nnet3/decodable-online-looped.cc
+++ b/src/nnet3/decodable-online-looped.cc
@@ -30,6 +30,7 @@ DecodableNnetLoopedOnlineBase::DecodableNnetLoopedOnlineBase(
     num_chunks_computed_(0),
     current_log_post_subsampled_offset_(-1),
     info_(info),
+    frame_offset_(0),
     input_features_(input_features),
     ivector_features_(ivector_features),
     computer_(info_.opts.compute_config, info_.computation,
@@ -66,7 +67,7 @@ int32 DecodableNnetLoopedOnlineBase::NumFramesReady() const {
   if (input_finished) {
     // if the input has finished,... we'll pad with duplicates of the last frame
     // as needed to get the required right context.
-    return (features_ready + sf - 1) / sf;
+    return (features_ready + sf - 1) / sf - frame_offset_;
   } else {
     // note: info_.right_context_ includes both the model context and any
     // extra_right_context_ (but this
@@ -78,7 +79,7 @@ int32 DecodableNnetLoopedOnlineBase::NumFramesReady() const {
     // doesn't need any attention to rounding because info_.frames_per_chunk
     // is always a multiple of 'sf' (see 'frames_per_chunk = GetChunksize..."
     // in decodable-simple-looped.cc).
-    return num_chunks_ready * info_.frames_per_chunk / sf;
+    return num_chunks_ready * info_.frames_per_chunk / sf - frame_offset_;
   }
 }
 
@@ -105,9 +106,14 @@ bool DecodableNnetLoopedOnlineBase::IsLastFrame(
     return false;
   int32 sf = info_.opts.frame_subsampling_factor,
      num_subsampled_frames_ready = (features_ready + sf - 1) / sf;
-  return (subsampled_frame == num_subsampled_frames_ready - 1);
+  return (subsampled_frame + frame_offset_ == num_subsampled_frames_ready - 1);
 }
 
+void DecodableNnetLoopedOnlineBase::SetFrameOffset(int32 frame_offset) {
+  KALDI_ASSERT(0 <= frame_offset &&
+               frame_offset <= frame_offset_ + NumFramesReady());
+  frame_offset_ = frame_offset;
+}
 
 void DecodableNnetLoopedOnlineBase::AdvanceChunk() {
   // Prepare the input data for the next chunk of features.
@@ -231,6 +237,7 @@ void DecodableNnetLoopedOnlineBase::AdvanceChunk() {
 
 BaseFloat DecodableNnetLoopedOnline::LogLikelihood(int32 subsampled_frame,
                                                     int32 index) {
+  subsampled_frame += frame_offset_;
   EnsureFrameIsComputed(subsampled_frame);
   // note: we index by 'inde
   return current_log_post_(
@@ -241,6 +248,7 @@ BaseFloat DecodableNnetLoopedOnline::LogLikelihood(int32 subsampled_frame,
 
 BaseFloat DecodableAmNnetLoopedOnline::LogLikelihood(int32 subsampled_frame,
                                                     int32 index) {
+  subsampled_frame += frame_offset_;
   EnsureFrameIsComputed(subsampled_frame);
   return current_log_post_(
       subsampled_frame - current_log_post_subsampled_offset_,
diff --git a/src/nnet3/decodable-online-looped.h b/src/nnet3/decodable-online-looped.h
index 4867c5decb8..f040b62516a 100644
--- a/src/nnet3/decodable-online-looped.h
+++ b/src/nnet3/decodable-online-looped.h
@@ -81,6 +81,17 @@ class DecodableNnetLoopedOnlineBase: public DecodableInterface {
     return info_.opts.frame_subsampling_factor;
   }
 
+  /// Sets the frame offset value. Frame offset is initialized to 0 when the
+  /// decodable object is constructed and stays as 0 unless this method is
+  /// called. This method is useful when we want to reset the decoder state,
+  /// i.e. call decoder.InitDecoding(), but we want to keep using the same
+  /// decodable object, e.g. in case of an endpoint. The frame offset affects
+  /// the behavior of IsLastFrame(), NumFramesReady() and LogLikelihood()
+  /// methods.
+  void SetFrameOffset(int32 frame_offset);
+
+  /// Returns the frame offset value.
+  int32 GetFrameOffset() const { return frame_offset_; }
 
  protected:
 
@@ -111,6 +122,11 @@ class DecodableNnetLoopedOnlineBase: public DecodableInterface {
 
   const DecodableNnetSimpleLoopedInfo &info_;
 
+  // IsLastFrame(), NumFramesReady() and LogLikelihood() methods take into
+  // account this offset value. We initialize frame_offset_ as 0 and it stays as
+  // 0 unless SetFrameOffset() method is called.
+  int32 frame_offset_;
+
  private:
 
   // This function does the computation for the next chunk.  It will change
diff --git a/src/online2/online-feature-pipeline.h b/src/online2/online-feature-pipeline.h
index f89cbbbb898..fab1be3cb27 100644
--- a/src/online2/online-feature-pipeline.h
+++ b/src/online2/online-feature-pipeline.h
@@ -166,7 +166,7 @@ class OnlineFeaturePipeline: public OnlineFeatureInterface {
 
   // This is supplied for debug purposes.
   void GetAsMatrix(Matrix<BaseFloat> *feats);
-  
+
   void FreezeCmvn();  // stop it from moving further (do this when you start
                       // using fMLLR). This will crash if NumFramesReady() == 0.
 
diff --git a/src/online2/online-nnet2-feature-pipeline.h b/src/online2/online-nnet2-feature-pipeline.h
index e379f7263ec..2e3fbf7bd78 100644
--- a/src/online2/online-nnet2-feature-pipeline.h
+++ b/src/online2/online-nnet2-feature-pipeline.h
@@ -196,6 +196,20 @@ class OnlineNnet2FeaturePipeline: public OnlineFeatureInterface {
   virtual int32 NumFramesReady() const;
   virtual void GetFrame(int32 frame, VectorBase<BaseFloat> *feat);
 
+  /// If you are downweighting silence, you can call
+  /// OnlineSilenceWeighting::GetDeltaWeights and supply the output to this
+  /// class using UpdateFrameWeights().  The reason why this call happens
+  /// outside this class, rather than this class pulling in the data weights,
+  /// relates to multi-threaded operation and also from not wanting this class
+  /// to have excessive dependencies.
+  ///
+  /// You must either always call this as soon as new data becomes available,
+  /// ideally just after calling AcceptWaveform(), or never call it for the
+  /// lifetime of this object.
+  void UpdateFrameWeights(
+      const std::vector<std::pair<int32, BaseFloat> > &delta_weights,
+      int32 frame_offset = 0);
+
   /// Set the adaptation state to a particular value, e.g. reflecting previous
   /// utterances of the same speaker; this will generally be called after
   /// Copy().
diff --git a/src/online2/online-nnet3-decoding.cc b/src/online2/online-nnet3-decoding.cc
index d40dcb411d1..08c8ba28060 100644
--- a/src/online2/online-nnet3-decoding.cc
+++ b/src/online2/online-nnet3-decoding.cc
@@ -41,6 +41,12 @@ SingleUtteranceNnet3DecoderTpl<FST>::SingleUtteranceNnet3DecoderTpl(
   decoder_.InitDecoding();
 }
 
+template <typename FST>
+void SingleUtteranceNnet3DecoderTpl<FST>::InitDecoding(int32 frame_offset) {
+  decoder_.InitDecoding();
+  decodable_.SetFrameOffset(frame_offset);
+}
+
 template <typename FST>
 void SingleUtteranceNnet3DecoderTpl<FST>::AdvanceDecoding() {
   decoder_.AdvanceDecoding(&decodable_);
@@ -56,7 +62,6 @@ int32 SingleUtteranceNnet3DecoderTpl<FST>::NumFramesDecoded() const {
   return decoder_.NumFramesDecoded();
 }
 
-
 template <typename FST>
 void SingleUtteranceNnet3DecoderTpl<FST>::GetLattice(bool end_of_utterance,
                                              CompactLattice *clat) const {
diff --git a/src/online2/online-nnet3-decoding.h b/src/online2/online-nnet3-decoding.h
index b30f035b4d2..b80baad893f 100644
--- a/src/online2/online-nnet3-decoding.h
+++ b/src/online2/online-nnet3-decoding.h
@@ -60,7 +60,13 @@ class SingleUtteranceNnet3DecoderTpl {
                                  const FST &fst,
                                  OnlineNnet2FeaturePipeline *features);
 
-  /// advance the decoding as far as we can.
+  /// Initializes the decoding and sets the frame offset of the underlying
+  /// decodable object. This method is called by the constructor. You can also
+  /// call this method when you want to reset the decoder state, but want to
+  /// keep using the same decodable object, e.g. in case of an endpoint.
+  void InitDecoding(int32 frame_offset = 0);
+
+  /// Advances the decoding as far as we can.
   void AdvanceDecoding();
 
   /// Finalizes the decoding. Cleans up and prunes remaining tokens, so the
diff --git a/src/online2/online2-feature-pipeline.cc b/src/online2/online2-feature-pipeline.cc
index 510c401fba2..c495c9fc8ef 100644
--- a/src/online2/online2-feature-pipeline.cc
+++ b/src/online2/online2-feature-pipeline.cc
@@ -128,6 +128,21 @@ void OnlineNnet2FeaturePipeline::GetFrame(int32 frame,
   return final_feature_->GetFrame(frame, feat);
 }
 
+void OnlineNnet2FeaturePipeline::UpdateFrameWeights(
+    const std::vector<std::pair<int32, BaseFloat> > &delta_weights,
+    int32 frame_offset) {
+  // With no offset the weights are forwarded as-is; otherwise every frame
+  // index is shifted by frame_offset first.
+  if (frame_offset == 0) {
+    IvectorFeature()->UpdateFrameWeights(delta_weights);
+    return;
+  }
+  std::vector<std::pair<int32, BaseFloat> > shifted(delta_weights);
+  for (size_t i = 0; i < shifted.size(); i++)
+    shifted[i].first += frame_offset;
+  IvectorFeature()->UpdateFrameWeights(shifted);
+}
+
 void OnlineNnet2FeaturePipeline::SetAdaptationState(
     const OnlineIvectorExtractorAdaptationState &adaptation_state) {
   if (info_.use_ivectors) {
diff --git a/src/online2bin/Makefile b/src/online2bin/Makefile
index 8792cc5b11a..28c135eb950 100644
--- a/src/online2bin/Makefile
+++ b/src/online2bin/Makefile
@@ -11,7 +11,8 @@ BINFILES = online2-wav-gmm-latgen-faster apply-cmvn-online \
      online2-wav-nnet2-latgen-faster ivector-extract-online2 \
      online2-wav-dump-features ivector-randomize \
      online2-wav-nnet2-am-compute  online2-wav-nnet2-latgen-threaded \
-     online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar
+     online2-wav-nnet3-latgen-faster online2-wav-nnet3-latgen-grammar \
+     online2-tcp-nnet3-decode-faster
 
 OBJFILES =
 
diff --git a/src/online2bin/online2-tcp-nnet3-decode-faster.cc b/src/online2bin/online2-tcp-nnet3-decode-faster.cc
new file mode 100644
index 00000000000..46e9cbc05be
--- /dev/null
+++ b/src/online2bin/online2-tcp-nnet3-decode-faster.cc
@@ -0,0 +1,442 @@
+// online2bin/online2-tcp-nnet3-decode-faster.cc
+
+// Copyright 2014  Johns Hopkins University (author: Daniel Povey)
+//           2016  Api.ai (Author: Ilya Platonov)
+//           2018  Polish-Japanese Academy of Information Technology (Author: Danijel Korzinek)
+
+// See ../../COPYING for clarification regarding multiple authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//  http://www.apache.org/licenses/LICENSE-2.0
+//
+// THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+// WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+// MERCHANTABLITY OR NON-INFRINGEMENT.
+// See the Apache 2 License for the specific language governing permissions and
+// limitations under the License.
+
+#include "feat/wave-reader.h"
+#include "online2/online-nnet3-decoding.h"
+#include "online2/online-nnet2-feature-pipeline.h"
+#include "online2/onlinebin-util.h"
+#include "online2/online-timing.h"
+#include "online2/online-endpoint.h"
+#include "fstext/fstext-lib.h"
+#include "lat/lattice-functions.h"
+#include "util/kaldi-thread.h"
+#include "nnet3/nnet-utils.h"
+
+#include <netinet/in.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <poll.h>
+#include <signal.h>
+#include <arpa/inet.h>
+#include <unistd.h>
+#include <string>
+
+namespace kaldi {
+
+class TcpServer {  // minimal TCP server: reads 16-bit samples from one client, writes text back
+ public:
+  explicit TcpServer(int read_timeout);  // read_timeout is given in seconds
+  ~TcpServer();
+
+  bool Listen(int32 port);  // start listening on a given port
+  int32 Accept();  // accept a client and return its descriptor
+
+  bool ReadChunk(size_t len); // get more data and return false if end-of-stream
+
+  Vector<BaseFloat> GetChunk(); // get the data read by above method
+
+  bool Write(const std::string &msg); // write to accepted client
+  bool WriteLn(const std::string &msg, const std::string &eol = "\n"); // write line to accepted client
+
+  void Disconnect();  // close the accepted client's socket, if one is open
+
+ private:
+  struct ::sockaddr_in h_addr_;  // socket address (filled in by Listen())
+  int32 server_desc_, client_desc_;  // socket descriptors; -1 when not open
+  int16 *samp_buf_;  // buffer of 16-bit samples filled by ReadChunk()
+  size_t buf_len_, has_read_;  // buffer capacity / samples read by last ReadChunk()
+  pollfd client_set_[1];  // poll() entry for the client socket
+  int read_timeout_;  // poll() timeout in milliseconds
+};
+
+// Returns the word sequence of lat (via GetLinearSymbolSequence) as a string
+// of space-terminated words; unknown word-ids are emitted as "<#id>".
+std::string LatticeToString(const Lattice &lat, const fst::SymbolTable &word_syms) {
+  LatticeWeight weight;
+  std::vector<int32> alignment, words;
+  GetLinearSymbolSequence(lat, &alignment, &words, &weight);
+  std::ostringstream msg;
+  for (size_t i = 0; i < words.size(); i++) {
+    std::string s = word_syms.Find(words[i]);
+    if (s.empty()) {
+      KALDI_WARN << "Word-id " << words[i] << " not in symbol table.";
+      msg << "<#" << words[i] << "> ";  // show the unknown id, not its position
+    } else
+      msg << s << " ";
+  }
+  return msg.str();
+}
+
+// Returns the words on the shortest path through clat, or "" if clat is empty.
+std::string LatticeToString(const CompactLattice &clat, const fst::SymbolTable &word_syms) {
+  if (clat.NumStates() == 0) {
+    KALDI_WARN << "Empty lattice.";
+    return "";
+  }
+  CompactLattice best_path_clat;
+  CompactLatticeShortestPath(clat, &best_path_clat);
+  Lattice best_path_lat;
+  ConvertLattice(best_path_clat, &best_path_lat);
+  return LatticeToString(best_path_lat, word_syms);
+}
+}
+
+// Serves TCP clients one at a time: decodes streamed audio, sends back partial and final text.
+int main(int argc, char *argv[]) {
+  try {
+    using namespace kaldi;
+    using namespace fst;
+
+    typedef kaldi::int32 int32;
+    typedef kaldi::int64 int64;
+
+    const char *usage =
+        "Reads in audio from a network socket and performs online\n"
+        "decoding with neural nets (nnet3 setup), with iVector-based\n"
+        "speaker adaptation and endpointing.\n"
+        "Note: some configuration values and inputs are set via config\n"
+        "files whose filenames are passed as options\n"
+        "\n"
+        "Usage: online2-tcp-nnet3-decode-faster [options] <nnet3-in> "
+        "<fst-in> <word-symbol-table>\n";
+
+    ParseOptions po(usage);
+
+    // feature_opts includes configuration for the iVector adaptation,
+    // as well as the basic features.
+    OnlineNnet2FeaturePipelineConfig feature_opts;
+    nnet3::NnetSimpleLoopedComputationOptions decodable_opts;
+    LatticeFasterDecoderConfig decoder_opts;
+    OnlineEndpointConfig endpoint_opts;
+
+    BaseFloat chunk_length_secs = 0.18;
+    BaseFloat output_period = 1;
+    BaseFloat samp_freq = 16000.0;
+    int port_num = 5050;
+    int read_timeout = 3;
+
+    po.Register("samp-freq", &samp_freq,
+                "Sampling frequency of the input signal (coded as 16-bit slinear).");
+    po.Register("chunk-length", &chunk_length_secs,
+                "Length of chunk size in seconds, that we process.");
+    po.Register("output-period", &output_period,
+                "How often in seconds, do we check for changes in output.");
+    po.Register("num-threads-startup", &g_num_threads,
+                "Number of threads used when initializing iVector extractor.");
+    po.Register("read-timeout", &read_timeout,
+                "Number of seconds of timout for TCP audio data to appear on the stream. Use -1 for blocking.");
+    po.Register("port-num", &port_num,
+                "Port number the server will listen on.");
+
+    feature_opts.Register(&po);
+    decodable_opts.Register(&po);
+    decoder_opts.Register(&po);
+    endpoint_opts.Register(&po);
+
+    po.Read(argc, argv);
+
+    if (po.NumArgs() != 3) {
+      po.PrintUsage();
+      return 1;
+    }
+
+    std::string nnet3_rxfilename = po.GetArg(1),
+        fst_rxfilename = po.GetArg(2),
+        word_syms_filename = po.GetArg(3);
+
+    OnlineNnet2FeaturePipelineInfo feature_info(feature_opts);
+
+    KALDI_VLOG(1) << "Loading AM...";
+
+    TransitionModel trans_model;
+    nnet3::AmNnetSimple am_nnet;
+    {
+      bool binary;
+      Input ki(nnet3_rxfilename, &binary);
+      trans_model.Read(ki.Stream(), binary);
+      am_nnet.Read(ki.Stream(), binary);
+      SetBatchnormTestMode(true, &(am_nnet.GetNnet()));
+      SetDropoutTestMode(true, &(am_nnet.GetNnet()));
+      nnet3::CollapseModel(nnet3::CollapseModelConfig(), &(am_nnet.GetNnet()));
+    }
+
+    // this object contains precomputed stuff that is used by all decodable
+    // objects.  It takes a pointer to am_nnet because if it has iVectors it has
+    // to modify the nnet to accept iVectors at intervals.
+    nnet3::DecodableNnetSimpleLoopedInfo decodable_info(decodable_opts,
+                                                        &am_nnet);
+
+    KALDI_VLOG(1) << "Loading FST...";
+
+    fst::Fst<fst::StdArc> *decode_fst = ReadFstKaldiGeneric(fst_rxfilename);
+
+    // Every result below is rendered through *word_syms, so it must be loaded.
+    fst::SymbolTable *word_syms = fst::SymbolTable::ReadText(word_syms_filename);
+    if (word_syms == NULL)
+      KALDI_ERR << "Could not read symbol table from file "
+                << word_syms_filename;
+
+    signal(SIGPIPE, SIG_IGN); // ignore SIGPIPE to avoid crashing when socket forcefully disconnected
+
+    TcpServer server(read_timeout);
+
+    server.Listen(port_num);
+
+    while (true) {
+      // Each iteration serves one client connection, from accept to disconnect.
+      server.Accept();
+
+      int32 samp_count = 0;// this is used for output refresh rate
+      size_t chunk_len = static_cast<size_t>(chunk_length_secs * samp_freq);
+      int32 check_period = static_cast<int32>(samp_freq * output_period);
+      int32 check_count = check_period;
+
+      int32 frame_offset = 0;
+
+      bool eos = false;
+
+      OnlineNnet2FeaturePipeline feature_pipeline(feature_info);
+      SingleUtteranceNnet3Decoder decoder(decoder_opts, trans_model,
+                                          decodable_info,
+                                          *decode_fst, &feature_pipeline);
+
+      while (!eos) {
+        // Each iteration decodes one segment, ended by an endpoint or end-of-stream.
+        decoder.InitDecoding(frame_offset);
+        OnlineSilenceWeighting silence_weighting(
+            trans_model,
+            feature_info.silence_weighting_config,
+            decodable_opts.frame_subsampling_factor);
+        std::vector<std::pair<int32, BaseFloat>> delta_weights;
+
+        while (true) {
+          eos = !server.ReadChunk(chunk_len);
+
+          if (eos) {
+            feature_pipeline.InputFinished();
+            decoder.AdvanceDecoding();
+            decoder.FinalizeDecoding();
+            frame_offset += decoder.NumFramesDecoded();
+            if (decoder.NumFramesDecoded() > 0) {
+              CompactLattice lat;
+              decoder.GetLattice(true, &lat);
+              std::string msg = LatticeToString(lat, *word_syms);
+              server.WriteLn(msg);
+            } else
+              server.Write("\n");
+            server.Disconnect();
+            break;
+          }
+
+          Vector<BaseFloat> wave_part = server.GetChunk();
+          feature_pipeline.AcceptWaveform(samp_freq, wave_part);
+          samp_count += wave_part.Dim();  // samples actually received, not requested
+
+          if (silence_weighting.Active() &&
+              feature_pipeline.IvectorFeature() != NULL) {
+            silence_weighting.ComputeCurrentTraceback(decoder.Decoder());
+            silence_weighting.GetDeltaWeights(feature_pipeline.NumFramesReady(),
+                                              &delta_weights);
+            feature_pipeline.UpdateFrameWeights(delta_weights,
+                                                frame_offset * decodable_opts.frame_subsampling_factor);
+          }
+
+          decoder.AdvanceDecoding();
+
+          if (samp_count > check_count) {
+            if (decoder.NumFramesDecoded() > 0) {
+              Lattice lat;
+              decoder.GetBestPath(false, &lat);
+              std::string msg = LatticeToString(lat, *word_syms);
+              server.WriteLn(msg, "\r");
+            }
+            check_count += check_period;
+          }
+
+          if (decoder.EndpointDetected(endpoint_opts)) {
+            decoder.FinalizeDecoding();
+            frame_offset += decoder.NumFramesDecoded();
+            CompactLattice lat;
+            decoder.GetLattice(true, &lat);
+            std::string msg = LatticeToString(lat, *word_syms);
+            server.WriteLn(msg);
+            break;
+          }
+        }
+      }
+    }
+  } catch (const std::exception &e) {
+    std::cerr << e.what();
+    return -1;
+  }
+} // main()
+
+
+namespace kaldi {
+TcpServer::TcpServer(int read_timeout)  // read_timeout: seconds to wait for data
+    : server_desc_(-1), client_desc_(-1), samp_buf_(NULL),
+      buf_len_(0), has_read_(0),  // has_read_ was previously left uninitialized
+      read_timeout_(1000 * read_timeout) {  // poll() expects milliseconds
+  client_set_[0].fd = -1;  // no client yet; give the poll entry a defined state
+  client_set_[0].events = POLLIN;
+}
+
+// Creates the server socket, enables SO_REUSEADDR, binds it to `port` on
+// INADDR_ANY and starts listening.  Each failure is reported via KALDI_ERR.
+bool TcpServer::Listen(int32 port) {
+  server_desc_ = socket(AF_INET, SOCK_STREAM, 0);
+  if (server_desc_ == -1) {
+    KALDI_ERR << "Cannot create TCP socket!";
+    return false;
+  }
+
+  int32 reuse_addr = 1;
+  if (setsockopt(server_desc_, SOL_SOCKET, SO_REUSEADDR, &reuse_addr,
+                 sizeof(reuse_addr)) == -1) {
+    KALDI_ERR << "Cannot set socket options!";
+    return false;
+  }
+
+  h_addr_.sin_family = AF_INET;
+  h_addr_.sin_port = htons(port);
+  h_addr_.sin_addr.s_addr = INADDR_ANY;
+  struct sockaddr *bind_addr = (struct sockaddr *) &h_addr_;
+  if (bind(server_desc_, bind_addr, sizeof(h_addr_)) == -1) {
+    KALDI_ERR << "Cannot bind to port: " << port << " (is it taken?)";
+    return false;
+  }
+
+  const int backlog = 1;  // length of the pending-connection queue
+  if (listen(server_desc_, backlog) == -1) {
+    KALDI_ERR << "Cannot listen on port!";
+    return false;
+  }
+
+  KALDI_LOG << "TcpServer: Listening on port: " << port;
+  return true;
+}
+
+// Closes the client and server sockets (if open) and frees the sample buffer.
+TcpServer::~TcpServer() {
+  Disconnect();
+  if (server_desc_ != -1) close(server_desc_);
+  delete[] samp_buf_;
+}
+
+// Blocks until a client connects and registers its descriptor for poll().
+// Returns the client descriptor, or -1 (after a warning) if accept() failed.
+int32 TcpServer::Accept() {
+  KALDI_LOG << "Waiting for client...";
+
+  // A separate address struct keeps the listening address in h_addr_ intact.
+  struct sockaddr_in client_addr;
+  socklen_t len = sizeof(client_addr);
+  client_desc_ = accept(server_desc_,
+                        reinterpret_cast<struct sockaddr *>(&client_addr), &len);
+  client_set_[0].fd = client_desc_;
+  client_set_[0].events = POLLIN;
+  if (client_desc_ == -1) {
+    KALDI_WARN << "Cannot accept client connection!";
+    return client_desc_;  // client_addr was not filled in; nothing to log
+  }
+
+  char ipstr[INET_ADDRSTRLEN] = "";
+  inet_ntop(AF_INET, &client_addr.sin_addr, ipstr, sizeof(ipstr));
+
+  KALDI_LOG << "Accepted connection from: " << ipstr;
+
+  return client_desc_;
+}
+
+// Reads up to `len` 16-bit samples from the client into samp_buf_.  Returns
+// false if no complete sample was read (timeout, socket error or stream end).
+bool TcpServer::ReadChunk(size_t len) {
+  if (buf_len_ != len) {
+    buf_len_ = len;
+    delete[] samp_buf_;
+    samp_buf_ = new int16[len];
+  }
+  char *byte_buf = reinterpret_cast<char *>(samp_buf_);
+  size_t to_read = len * sizeof(int16);  // in bytes: read() may stop mid-sample
+  has_read_ = 0;
+  while (to_read > 0) {
+    int poll_ret = poll(client_set_, 1, read_timeout_);
+    if (poll_ret == 0) {
+      KALDI_WARN << "Socket timeout! Disconnecting...";
+      break;
+    }
+    if (poll_ret < 0 || !(client_set_[0].revents & POLLIN)) {
+      KALDI_WARN << "Socket error! Disconnecting...";
+      break;
+    }
+    ssize_t ret = read(client_desc_, byte_buf + has_read_, to_read);
+    if (ret <= 0) {
+      KALDI_WARN << "Stream over...";
+      break;
+    }
+    to_read -= ret;
+    has_read_ += ret;
+  }
+  has_read_ /= sizeof(int16);  // bytes -> whole samples
+  return has_read_ > 0;
+}
+
+// Copies the samples stored by the last ReadChunk() into a BaseFloat vector.
+Vector<BaseFloat> TcpServer::GetChunk() {
+  const MatrixIndexT num_samples = static_cast<MatrixIndexT>(has_read_);
+  Vector<BaseFloat> chunk;
+  chunk.Resize(num_samples);
+  for (MatrixIndexT n = 0; n < num_samples; n++)
+    chunk(n) = static_cast<BaseFloat>(samp_buf_[n]);
+
+  return chunk;
+}
+
+// Writes all of `msg` to the client, continuing after short writes.  Returns
+// false as soon as write() returns zero or an error.
+bool TcpServer::Write(const std::string &msg) {
+  const char *cursor = msg.c_str();
+  size_t remaining = msg.size();
+  while (remaining > 0) {
+    ssize_t sent = write(client_desc_, static_cast<const void *>(cursor), remaining);
+    if (sent <= 0)
+      return false;
+
+    cursor += sent;
+    remaining -= sent;
+  }
+
+  return true;
+}
+
+// Writes `msg` followed by `eol`; `eol` is only attempted if `msg` was
+// written completely.  Returns true when both writes succeed.
+bool TcpServer::WriteLn(const std::string &msg, const std::string &eol) {
+  return Write(msg) && Write(eol);
+}
+
+// Closes the client socket if one is open and marks it closed (-1).
+void TcpServer::Disconnect() {
+  if (client_desc_ == -1) return;
+  close(client_desc_);
+  client_desc_ = -1;
+}
+}  // namespace kaldi
\ No newline at end of file
diff --git a/src/onlinebin/online-gmm-decode-faster.cc b/src/onlinebin/online-gmm-decode-faster.cc
index 9aa8751cf50..dac7657ea57 100644
--- a/src/onlinebin/online-gmm-decode-faster.cc
+++ b/src/onlinebin/online-gmm-decode-faster.cc
@@ -70,7 +70,7 @@ int main(int argc, char *argv[]) {
     OnlineFeatureMatrixOptions feature_reading_opts;
     decoder_opts.Register(&po, true);
     feature_reading_opts.Register(&po);
-    
+
     po.Register("left-context", &left_context, "Number of frames of left context");
     po.Register("right-context", &right_context, "Number of frames of right context");
     po.Register("acoustic-scale", &acoustic_scale,
@@ -86,7 +86,7 @@ int main(int argc, char *argv[]) {
       po.PrintUsage();
       return 1;
     }
-    
+
     std::string model_rxfilename = po.GetArg(1),
         fst_rxfilename = po.GetArg(2),
         word_syms_filename = po.GetArg(3),
@@ -151,7 +151,7 @@ int main(int argc, char *argv[]) {
       opts.order = kDeltaOrder;
       feat_transform = new OnlineDeltaInput(opts, &cmn_input);
     }
-    
+
     // feature_reading_opts contains number of retries, batch size.
     OnlineFeatureMatrix feature_matrix(feature_reading_opts,
                                        feat_transform);
@@ -200,4 +200,4 @@ int main(int argc, char *argv[]) {
     return -1;
   }
 #endif
-} // main()
+}  // main()
diff --git a/src/util/edit-distance-inl.h b/src/util/edit-distance-inl.h
index c1d1682804c..3304b27d0bf 100644
--- a/src/util/edit-distance-inl.h
+++ b/src/util/edit-distance-inl.h
@@ -35,8 +35,8 @@ int32 LevenshteinEditDistance(const std::vector<T> &a,
   //  elements a_0 ... a_{M-1} and b_0 ... b_{N-1}.
   //  We are computing the recursion
   //     E(m, n) = min(  E(m-1, n-1) + (1-delta(a_{m-1}, b_{n-1})),
-  //                    E(m-1, n),
-  //                    E(m, n-1) ).
+  //                    E(m-1, n) + 1,
+  //                    E(m, n-1) + 1).
   //  where E(m, n) is defined for m = 0..M and n = 0..N and out-of-
   //  bounds quantities are considered to be infinity (i.e. the
   //  recursion does not visit them).
diff --git a/tools/extras/install_portaudio.sh b/tools/extras/install_portaudio.sh
index 58797f554e8..36c95047a7f 100755
--- a/tools/extras/install_portaudio.sh
+++ b/tools/extras/install_portaudio.sh
@@ -14,10 +14,10 @@
 #See the Apache 2 License for the specific language governing permissions and
 #limitations under the License.
 #
-#This script attempts to install port audio, which is needed for the run-on 
-#decoding stuff. Portaudio enables the decoder to grab a live audio stream 
-#from the soundcard. I tested portaudio on Linux (RedHat and Suse Linux) and 
-#on MacOS 10.7. On Linux, it compiles out of the box. For MacOS 10.7, 
+#This script attempts to install port audio, which is needed for the run-on
+#decoding stuff. Portaudio enables the decoder to grab a live audio stream
+#from the soundcard. I tested portaudio on Linux (RedHat and Suse Linux) and
+#on MacOS 10.7. On Linux, it compiles out of the box. For MacOS 10.7,
 #it is necessary to edit the Makefile (this script tries to do that).
 #The script will remove all occurances of
 #
@@ -29,8 +29,8 @@
 #also, it seems that one has to uncomment the inclusion of AudioToolbox in
 #include/pa_mac_core.h
 #
-#All this should make it compile fine for x86_64 under MacOS 10.7 
-#(always assuming that you installed XCode, wget and 
+#All this should make it compile fine for x86_64 under MacOS 10.7
+#(always assuming that you installed XCode, wget and
 #the Linux environment stuff on MacOS)
 
 echo "****() Installing portaudio"
@@ -38,7 +38,7 @@ echo "****() Installing portaudio"
 if [ ! -e pa_stable_v19_20111121.tgz ]; then
     echo "Could not find portaudio tarball pa_stable_v19_20111121.tgz"
     echo "Trying to download it via wget!"
-    
+
     if ! which wget >&/dev/null; then
         echo "This script requires you to first install wget"
         echo "You can also just download pa_stable_v19_20111121.tgz from"
@@ -81,6 +81,8 @@ if [ -z "$MACOS" ]; then
     echo "${pa_patch}" | patch -p0 Makefile.in
 fi
 
+patch -p0  Makefile.in < ../extras/portaudio.patch
+autoconf
 ./configure --prefix=`pwd`/install --with-pic
 perl -i -pe 's:src/common/pa_ringbuffer.o:: if /^OTHER_OBJS\s*=/' Makefile
 
@@ -93,7 +95,7 @@ if [ "$MACOS" != "" ]; then
     mv include/pa_mac_core.h include/pa_mac_core.h.bck
     cat include/pa_mac_core.h.bck \
       | sed 's/\/\/\#include \<AudioToolbox\/AudioToolbox.h\>/#include \<AudioToolbox\/AudioToolbox.h\>/g' \
-      > include/pa_mac_core.h 
+      > include/pa_mac_core.h
 fi
 
 make
diff --git a/tools/extras/portaudio.patch b/tools/extras/portaudio.patch
new file mode 100644
index 00000000000..9fc201f9278
--- /dev/null
+++ b/tools/extras/portaudio.patch
@@ -0,0 +1,21 @@
+diff --git a/Makefile.in b/Makefile.in
+index 24129a3..61a3952 100644
+--- a/Makefile.in
++++ b/Makefile.in
+@@ -44,7 +44,7 @@ PALIB = libportaudio.la
+ PAINC = include/portaudio.h
+ 
+ PA_LDFLAGS = $(LDFLAGS) $(SHARED_FLAGS) -rpath $(libdir) -no-undefined \
+-	     -export-symbols-regex "(Pa|PaMacCore|PaJack|PaAlsa|PaAsio|PaOSS)_.*" \
++	     -export-symbols-regex "(Pa|PaUtil|PaMacCore|PaJack|PaAlsa|PaAsio|PaOSS)_.*" \
+ 	     -version-info $(LT_CURRENT):$(LT_REVISION):$(LT_AGE)
+ 
+ COMMON_OBJS = \
+@@ -57,6 +57,7 @@ COMMON_OBJS = \
+ 	src/common/pa_process.o \
+ 	src/common/pa_stream.o \
+ 	src/common/pa_trace.o \
++	src/common/pa_ringbuffer.o \
+ 	src/hostapi/skeleton/pa_hostapi_skeleton.o
+ 
+ LOOPBACK_OBJS = \