Merge pull request #2294 from bertsky/lstm-with-char-whitelist
trying to add tessedit_char_whitelist etc. again:
zdenop authored Apr 6, 2019
2 parents be617b3 + f80508b commit ab09b09
Showing 6 changed files with 45 additions and 26 deletions.
CONTRIBUTING.md (12 changes: 11 additions & 1 deletion)

@@ -67,4 +67,14 @@ your question has been asked (and has been answered) many times before...

 ## For Developers: Creating a Pull Request

-TBD
+You should always make sure your changes build and run successfully.
+
+For that, your clone needs to have all submodules (`abseil`, `googletest`, `test`) included. To do so, either specify `--recurse-submodules` during the initial clone, or run `git submodule update --init --recursive NAME` for each `NAME` later. If `configure` already created those directories (blocking the clone), remove them first (or `make distclean`), then clone and reconfigure.
+
+Have a look at [the README](./README.md) and [testing README](./test/testing/README.md) and the [wiki page](https://github.com/tesseract-ocr/tesseract/wiki/Compiling-%E2%80%93-GitInstallation#unit-test-builds) on installation.
+
+In short, after running `configure` from the build directory of your choice, to build the library and CLI, run `make`. To test it, run `make check`. To build the training tools, run `make training`.
+
+As soon as your changes are building and tests are succeeding, you can publish them. If you have not already, please [fork](https://guides.github.com/activities/forking/) tesseract (somewhere) on GitHub, and push your changes to that fork (in a new branch). Then [submit as PR](https://help.github.com/en/articles/creating-a-pull-request-from-a-fork).
+
+Please also keep track of reports from CI (automated build status) and Coverity/LGTM (quality scan). When the indicators show deterioration after your changes, further action may be required to improve them.
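
A minimal sketch of that workflow (build directory layout is your choice; `autogen.sh` is the usual autotools bootstrap step in this repository, and the clone URL would normally point at your own fork):

    git clone --recurse-submodules https://github.com/tesseract-ocr/tesseract.git
    cd tesseract
    ./autogen.sh                 # generate configure
    mkdir -p build && cd build
    ../configure
    make                         # build the library and CLI
    make check                   # run the unit tests (needs the submodules)
    make training                # build the training tools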
src/ccmain/tesseractclass.cpp (12 changes: 12 additions & 0 deletions)

@@ -621,11 +621,23 @@ void Tesseract::SetBlackAndWhitelist() {
   unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
                                      tessedit_char_whitelist.string(),
                                      tessedit_char_unblacklist.string());
+  if (lstm_recognizer_) {
+    UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset());
+    lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
+                                            tessedit_char_whitelist.string(),
+                                            tessedit_char_unblacklist.string());
+  }
   // Black and white lists should apply to all loaded classifiers.
   for (int i = 0; i < sub_langs_.size(); ++i) {
     sub_langs_[i]->unicharset.set_black_and_whitelist(
         tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
         tessedit_char_unblacklist.string());
+    if (sub_langs_[i]->lstm_recognizer_) {
+      UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (sub_langs_[i]->lstm_recognizer_->GetUnicharset());
+      lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
+                                              tessedit_char_whitelist.string(),
+                                              tessedit_char_unblacklist.string());
+    }
   }
 }

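The practical effect of this change: the character list variables (`tessedit_char_whitelist`, `tessedit_char_blacklist`, `tessedit_char_unblacklist`) now also reach the LSTM recognizer's unicharset, for the main language and all sub-languages. A hedged usage sketch from the command line (the image name and character set are placeholders; `--oem 1` selects the LSTM engine and `-c` sets a config variable):

    # recognize digits only, using the LSTM engine
    tesseract digits.png stdout --oem 1 -c tessedit_char_whitelist=0123456789

The same `-c` mechanism applies to `tessedit_char_blacklist` and `tessedit_char_unblacklist`.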
src/ccutil/unicharset.h (1 change: 1 addition & 0 deletions)

@@ -871,6 +871,7 @@ class UNICHARSET {

   // Return the enabled property of the given unichar.
   bool get_enabled(UNICHAR_ID unichar_id) const {
+    ASSERT_HOST(contains_unichar_id(unichar_id));
     return unichars[unichar_id].properties.enabled;
   }

src/lstm/recodebeam.cpp (19 changes: 12 additions & 7 deletions)

@@ -521,10 +521,10 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
   if (t == 0) {
     // The first step can only use singles and initials.
     ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2,
-                    dict_ratio, cert_offset, worst_dict_cert, step);
+                    charset, dict_ratio, cert_offset, worst_dict_cert, step);
     if (dict_ != nullptr) {
-      ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs,
-                      TN_TOP2, dict_ratio, cert_offset, worst_dict_cert, step);
+      ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs, TN_TOP2,
+                      charset, dict_ratio, cert_offset, worst_dict_cert, step);
     }
   } else {
     RecodeBeam* prev = beam_[t - 1];
@@ -556,9 +556,8 @@
         // best first, but it comes before a lot of the worst, so it is slightly
         // more efficient than going forwards.
         for (int i = prev->beams_[index].size() - 1; i >= 0; --i) {
-          ContinueContext(&prev->beams_[index].get(i).data, index, outputs,
-                          top_n, dict_ratio, cert_offset, worst_dict_cert,
-                          step);
+          ContinueContext(&prev->beams_[index].get(i).data, index, outputs, top_n,
+                          charset, dict_ratio, cert_offset, worst_dict_cert, step);
         }
       }
       for (int index = 0; index < kNumBeams; ++index) {
@@ -585,7 +584,9 @@
 // choices for which top_n_flags[index] == top_n_flag.
 void RecodeBeamSearch::ContinueContext(const RecodeNode* prev, int index,
                                        const float* outputs,
-                                       TopNState top_n_flag, double dict_ratio,
+                                       TopNState top_n_flag,
+                                       const UNICHARSET* charset,
+                                       double dict_ratio,
                                        double cert_offset,
                                        double worst_dict_cert,
                                        RecodeBeam* step) {
@@ -648,6 +649,10 @@
     int unichar_id = recoder_.DecodeUnichar(full_code);
     // Map the null char to INVALID.
     if (length == 0 && code == null_char_) unichar_id = INVALID_UNICHAR_ID;
+    if (unichar_id != INVALID_UNICHAR_ID &&
+        charset != nullptr &&
+        !charset->get_enabled(unichar_id))
+      continue; // disabled by whitelist/blacklist
     ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio,
                     use_dawgs, NC_ANYTHING, prev, step);
     if (top_n_flag == TN_TOP2 && code != null_char_) {
src/lstm/recodebeam.h (6 changes: 3 additions & 3 deletions)

@@ -312,9 +312,9 @@ class RecodeBeamSearch {
   // using the given network outputs to provide scores to the choices. Uses only
   // those choices for which top_n_flags[code] == top_n_flag.
   void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
-                       TopNState top_n_flag, double dict_ratio,
-                       double cert_offset, double worst_dict_cert,
-                       RecodeBeam* step);
+                       TopNState top_n_flag, const UNICHARSET* unicharset,
+                       double dict_ratio, double cert_offset,
+                       double worst_dict_cert, RecodeBeam* step);
   // Continues for a new unichar, using dawg or non-dawg as per flag.
   void ContinueUnichar(int code, int unichar_id, float cert,
                        float worst_dict_cert, float dict_ratio, bool use_dawgs,
src/training/tesstrain_utils.sh (21 changes: 6 additions & 15 deletions)

@@ -70,23 +70,14 @@ err_exit() {
 # if the program file is not found.
 # Usage: run_command CMD ARG1 ARG2...
 run_command() {
-    local cmd=$(which $1)
-    if [[ -z ${cmd} ]]; then
-        for d in api training; do
-            cmd=$(which $d/$1)
-            if [[ ! -z ${cmd} ]]; then
-                break
-            fi
-        done
-        if [[ -z ${cmd} ]]; then
-            err_exit "$1 not found"
-        fi
-    fi
+    local cmd
+    cmd=$(which $1 || \
+        for d in api training; do
+            which $d/$1 && break
+        done) || err_exit "'$1' not found"
     shift
     tlog "[$(date)] ${cmd} $@"
-    "${cmd}" "$@" 2>&1 1>&2 | tee -a ${LOG_FILE}
-    # check completion status
-    if [[ $? -gt 0 ]]; then
+    if ! "${cmd}" "$@" |& tee -a ${LOG_FILE}; then
         err_exit "Program $(basename ${cmd}) failed. Abort."
     fi
 }
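For illustration, a hypothetical call of this helper from elsewhere in the training scripts (the text, output, and font names are made up): it resolves the program via `which`, falling back to the `api/` and `training/` build directories, logs the invocation with `tlog`, tees the output into `${LOG_FILE}`, and aborts via `err_exit` if the program is missing or fails.

    run_command text2image --text=corpus.txt --outputbase=eng.myfont.exp0 \
        --font="My Font" --fonts_dir=./fonts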
