Merge pull request #2294 from bertsky/lstm-with-char-whitelist
trying to add tessedit_char_whitelist etc. again:
zdenop authored Apr 6, 2019
2 parents be617b3 + f80508b commit ab09b09
Showing 6 changed files with 45 additions and 26 deletions.
CONTRIBUTING.md (12 changes: 11 additions & 1 deletion)

@@ -67,4 +67,14 @@ your question has been asked (and has been answered) many times before...

 ## For Developers: Creating a Pull Request

-TBD
+You should always make sure your changes build and run successfully.
+
+For that, your clone needs to have all submodules (`abseil`, `googletest`, `test`) included. To do so, either specify `--recurse-submodules` during the initial clone, or run `git submodule update --init --recursive NAME` for each `NAME` later. If `configure` already created those directories (blocking the clone), remove them first (or `make distclean`), then clone and reconfigure.
+
+Have a look at [the README](./README.md) and [testing README](./test/testing/README.md) and the [wiki page](https://github.com/tesseract-ocr/tesseract/wiki/Compiling-%E2%80%93-GitInstallation#unit-test-builds) on installation.
+
+In short, after running `configure` from the build directory of your choice, to build the library and CLI, run `make`. To test it, run `make check`. To build the training tools, run `make training`.
+
+As soon as your changes are building and tests are succeeding, you can publish them. If you have not already, please [fork](https://guides.github.com/activities/forking/) tesseract (somewhere) on GitHub, and push your changes to that fork (in a new branch). Then [submit as PR](https://help.github.com/en/articles/creating-a-pull-request-from-a-fork).
+
+Please also keep track of reports from CI (automated build status) and Coverity/LGTM (quality scan). When the indicators show deterioration after your changes, further action may be required to improve them.
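
A minimal sketch of that workflow (build directory layout is your choice; `autogen.sh` is the usual autotools bootstrap step in this repository, and the clone URL would normally point at your own fork):

    git clone --recurse-submodules https://github.com/tesseract-ocr/tesseract.git
    cd tesseract
    ./autogen.sh                 # generate configure
    mkdir -p build && cd build
    ../configure
    make                         # build the library and CLI
    make check                   # run the unit tests (needs the submodules)
    make training                # build the training tools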
src/ccmain/tesseractclass.cpp (12 changes: 12 additions & 0 deletions)

@@ -621,11 +621,23 @@ void Tesseract::SetBlackAndWhitelist() {
   unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
                                      tessedit_char_whitelist.string(),
                                      tessedit_char_unblacklist.string());
+  if (lstm_recognizer_) {
+    UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset());
+    lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
+                                            tessedit_char_whitelist.string(),
+                                            tessedit_char_unblacklist.string());
+  }
   // Black and white lists should apply to all loaded classifiers.
   for (int i = 0; i < sub_langs_.size(); ++i) {
     sub_langs_[i]->unicharset.set_black_and_whitelist(
         tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
         tessedit_char_unblacklist.string());
+    if (sub_langs_[i]->lstm_recognizer_) {
+      UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (sub_langs_[i]->lstm_recognizer_->GetUnicharset());
+      lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
+                                              tessedit_char_whitelist.string(),
+                                              tessedit_char_unblacklist.string());
+    }
   }
 }

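The practical effect of this change: the character list variables (`tessedit_char_whitelist`, `tessedit_char_blacklist`, `tessedit_char_unblacklist`) now also reach the LSTM recognizer's unicharset, for the main language and all sub-languages. A hedged usage sketch from the command line (the image name and character set are placeholders; `--oem 1` selects the LSTM engine and `-c` sets a config variable):

    # recognize digits only, using the LSTM engine
    tesseract digits.png stdout --oem 1 -c tessedit_char_whitelist=0123456789

The same `-c` mechanism applies to `tessedit_char_blacklist` and `tessedit_char_unblacklist`.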
src/ccutil/unicharset.h (1 change: 1 addition & 0 deletions)

@@ -871,6 +871,7 @@ class UNICHARSET {

   // Return the enabled property of the given unichar.
   bool get_enabled(UNICHAR_ID unichar_id) const {
+    ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.enabled;
   }

src/lstm/recodebeam.cpp (19 changes: 12 additions & 7 deletions)

@@ -521,10 +521,10 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
   if (t == 0) {
     // The first step can only use singles and initials.
     ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2,
-                    dict_ratio, cert_offset, worst_dict_cert, step);
+                    charset, dict_ratio, cert_offset, worst_dict_cert, step);
     if (dict_ != nullptr) {
-      ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs,
-                      TN_TOP2, dict_ratio, cert_offset, worst_dict_cert, step);
+      ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs, TN_TOP2,
+                      charset, dict_ratio, cert_offset, worst_dict_cert, step);
     }
   } else {
     RecodeBeam* prev = beam_[t - 1];
@@ -556,9 +556,8 @@
         // best first, but it comes before a lot of the worst, so it is slightly
         // more efficient than going forwards.
         for (int i = prev->beams_[index].size() - 1; i >= 0; --i) {
-          ContinueContext(&prev->beams_[index].get(i).data, index, outputs,
-                          top_n, dict_ratio, cert_offset, worst_dict_cert,
-                          step);
+          ContinueContext(&prev->beams_[index].get(i).data, index, outputs, top_n,
+                          charset, dict_ratio, cert_offset, worst_dict_cert, step);
         }
       }
       for (int index = 0; index < kNumBeams; ++index) {
@@ -585,7 +584,9 @@
 // choices for which top_n_flags[index] == top_n_flag.
 void RecodeBeamSearch::ContinueContext(const RecodeNode* prev, int index,
                                        const float* outputs,
-                                       TopNState top_n_flag, double dict_ratio,
+                                       TopNState top_n_flag,
+                                       const UNICHARSET* charset,
+                                       double dict_ratio,
                                        double cert_offset,
                                        double worst_dict_cert,
                                        RecodeBeam* step) {
@@ -648,6 +649,10 @@
     int unichar_id = recoder_.DecodeUnichar(full_code);
     // Map the null char to INVALID.
     if (length == 0 && code == null_char_) unichar_id = INVALID_UNICHAR_ID;
+    if (unichar_id != INVALID_UNICHAR_ID &&
+        charset != nullptr &&
+        !charset->get_enabled(unichar_id))
+      continue; // disabled by whitelist/blacklist
     ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio,
                     use_dawgs, NC_ANYTHING, prev, step);
     if (top_n_flag == TN_TOP2 && code != null_char_) {
src/lstm/recodebeam.h (6 changes: 3 additions & 3 deletions)

@@ -312,9 +312,9 @@ class RecodeBeamSearch {
   // using the given network outputs to provide scores to the choices. Uses only
   // those choices for which top_n_flags[code] == top_n_flag.
   void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
-                       TopNState top_n_flag, double dict_ratio,
-                       double cert_offset, double worst_dict_cert,
-                       RecodeBeam* step);
+                       TopNState top_n_flag, const UNICHARSET* unicharset,
+                       double dict_ratio, double cert_offset,
+                       double worst_dict_cert, RecodeBeam* step);
   // Continues for a new unichar, using dawg or non-dawg as per flag.
   void ContinueUnichar(int code, int unichar_id, float cert,
                        float worst_dict_cert, float dict_ratio, bool use_dawgs,
src/training/tesstrain_utils.sh (21 changes: 6 additions & 15 deletions)

@@ -70,23 +70,14 @@ err_exit() {
 # if the program file is not found.
 # Usage: run_command CMD ARG1 ARG2...
 run_command() {
-    local cmd=$(which $1)
-    if [[ -z ${cmd} ]]; then
-        for d in api training; do
-            cmd=$(which $d/$1)
-            if [[ ! -z ${cmd} ]]; then
-                break
-            fi
-        done
-        if [[ -z ${cmd} ]]; then
-            err_exit "$1 not found"
-        fi
-    fi
+    local cmd
+    cmd=$(which $1 || \
+        for d in api training; do
+            which $d/$1 && break
+        done) || err_exit "'$1' not found"
     shift
     tlog "[$(date)] ${cmd} $@"
-    "${cmd}" "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
-    # check completion status
-    if [[ $? -gt 0 ]]; then
+    if ! "${cmd}" "$@" |& tee -a ${LOG_FILE}; then
         err_exit "Program $(basename ${cmd}) failed. Abort."
     fi
 }
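For illustration, a hypothetical call of this helper from elsewhere in the training scripts (the text, output, and font names are made up): it resolves the program via `which`, falling back to the `api/` and `training/` build directories, logs the invocation with `tlog`, tees the output into `${LOG_FILE}`, and aborts via `err_exit` if the program is missing or fails.

    run_command text2image --text=corpus.txt --outputbase=eng.myfont.exp0 \
        --font="My Font" --fonts_dir=./fonts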
