From 70993585106155af617ef93306539e854a4ce19b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Zdenko=20Podobn=C3=BD?= Date: Sun, 4 Dec 2016 22:06:52 +0100 Subject: [PATCH] backport from 4.00: training --- training/boxchar.cpp | 12 +- training/boxchar.h | 3 + training/cntraining.cpp | 42 ++++--- training/commontraining.cpp | 10 +- training/degradeimage.cpp | 163 +++++++++++++++++++++++++ training/degradeimage.h | 28 ++++- training/fileio.cpp | 2 +- training/mftraining.cpp | 6 +- training/normstrngs.cpp | 8 +- training/normstrngs.h | 9 +- training/pango_font_info.cpp | 129 +++++++++---------- training/pango_font_info.h | 31 +++-- training/stringrenderer.cpp | 20 +-- training/unicharset_training_utils.cpp | 7 +- training/unicharset_training_utils.h | 8 +- 15 files changed, 357 insertions(+), 121 deletions(-) diff --git a/training/boxchar.cpp b/training/boxchar.cpp index b99c12a600..4324597744 100644 --- a/training/boxchar.cpp +++ b/training/boxchar.cpp @@ -73,7 +73,6 @@ void BoxChar::PrepareToWrite(vector* boxes) { if (rtl_rules) { ReorderRTLText(boxes); } - tprintf("Rtl = %d ,vertical=%d\n", rtl_rules, vertical_rules); } // Inserts newline (tab) characters into the vector at newline positions. @@ -291,13 +290,19 @@ const int kMaxLineLength = 1024; /* static */ void BoxChar::WriteTesseractBoxFile(const string& filename, int height, const vector& boxes) { + string output = GetTesseractBoxStr(height, boxes); + File::WriteStringToFileOrDie(output, filename); +} + +/* static */ +string BoxChar::GetTesseractBoxStr(int height, const vector& boxes) { string output; char buffer[kMaxLineLength]; for (int i = 0; i < boxes.size(); ++i) { const Box* box = boxes[i]->box_; if (box == NULL) { tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n"); - return; + return ""; } int nbytes = snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n", @@ -305,6 +310,7 @@ void BoxChar::WriteTesseractBoxFile(const string& filename, int height, box->x + box->w, height - box->y, boxes[i]->page_); output.append(buffer, nbytes); } - File::WriteStringToFileOrDie(output, filename); + return output; } + } // namespace tesseract diff --git a/training/boxchar.h b/training/boxchar.h index 27b568a143..3748c4abad 100644 --- a/training/boxchar.h +++ b/training/boxchar.h @@ -100,6 +100,9 @@ class BoxChar { // is needed to convert to tesseract coordinates. static void WriteTesseractBoxFile(const string& name, int height, const vector& boxes); + // Gets the tesseract box file as a string from the vector of boxes. + // The image height is needed to convert to tesseract coordinates. + static string GetTesseractBoxStr(int height, const vector& boxes); private: string ch_; diff --git a/training/cntraining.cpp b/training/cntraining.cpp index 916a758576..6f4f42aebe 100644 --- a/training/cntraining.cpp +++ b/training/cntraining.cpp @@ -52,8 +52,8 @@ int main ( Private Function Prototypes ----------------------------------------------------------------------------*/ -void WriteNormProtos (const char *Directory, LIST LabeledProtoList, - CLUSTERER *Clusterer); +void WriteNormProtos(const char *Directory, LIST LabeledProtoList, + const FEATURE_DESC_STRUCT *feature_desc); /* PARAMDESC *ConvertToPARAMDESC( @@ -160,13 +160,18 @@ int main(int argc, char *argv[]) { // reduce the min samples: // Config.MinSamples = 0.5 / num_fonts; pCharList = CharList; + // The norm protos will count the source protos, so we keep them here in + // freeable_protos, so they can be freed later. + GenericVector freeable_protos; iterate(pCharList) { //Cluster - if (Clusterer) - FreeClusterer(Clusterer); CharSample = (LABELEDLIST)first_node(pCharList); Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); + if (Clusterer == NULL) { // To avoid a SIGSEGV + fprintf(stderr, "Error: NULL clusterer!\n"); + return 1; + } float SavedMinSamples = Config.MinSamples; // To disable the tendency to produce a single cluster for all fonts, // make MagicSamples an impossible to achieve number: @@ -185,21 +190,21 @@ int main(int argc, char *argv[]) { } Config.MinSamples = SavedMinSamples; AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); + freeable_protos.push_back(ProtoList); + FreeClusterer(Clusterer); } FreeTrainingSamples(CharList); - if (Clusterer == NULL) { // To avoid a SIGSEGV - fprintf(stderr, "Error: NULL clusterer!\n"); - return 1; - } - WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer); + int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE); + WriteNormProtos(FLAGS_D.c_str(), NormProtoList, + FeatureDefs.FeatureDesc[desc_index]); FreeNormProtoList(NormProtoList); - FreeProtoList(&ProtoList); - FreeClusterer(Clusterer); + for (int i = 0; i < freeable_protos.size(); ++i) { + FreeProtoList(&freeable_protos[i]); + } printf ("\n"); return 0; } // main - /*---------------------------------------------------------------------------- Private Code ----------------------------------------------------------------------------*/ @@ -211,14 +216,13 @@ int main(int argc, char *argv[]) { * of the samples. * @param Directory directory to place sample files into * @param LabeledProtoList List of labeled protos -* @param Clusterer The CLUSTERER to use +* @param feature_desc Description of the features * @return none * @note Exceptions: none * @note History: Fri Aug 18 16:17:06 1989, DSJ, Created. */ -void WriteNormProtos(const char *Directory, LIST LabeledProtoList, - CLUSTERER *Clusterer) -{ +void WriteNormProtos(const char *Directory, LIST LabeledProtoList, + const FEATURE_DESC_STRUCT *feature_desc) { FILE *File; STRING Filename; LABELEDLIST LabeledProto; @@ -233,8 +237,8 @@ void WriteNormProtos(const char *Directory, LIST LabeledProtoList, Filename += "normproto"; printf ("\nWriting %s ...", Filename.string()); File = Efopen (Filename.string(), "wb"); - fprintf(File, "%0d\n", Clusterer->SampleSize); - WriteParamDesc(File, Clusterer->SampleSize,Clusterer->ParamDesc); + fprintf(File, "%0d\n", feature_desc->NumParams); + WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc); iterate(LabeledProtoList) { LabeledProto = (LABELEDLIST) first_node (LabeledProtoList); @@ -249,7 +253,7 @@ void WriteNormProtos(const char *Directory, LIST LabeledProtoList, exit(1); } fprintf(File, "\n%s %d\n", LabeledProto->Label, N); - WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false); + WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false); } fclose (File); diff --git a/training/commontraining.cpp b/training/commontraining.cpp index b7243e6f3f..f77a553de4 100644 --- a/training/commontraining.cpp +++ b/training/commontraining.cpp @@ -40,7 +40,6 @@ using tesseract::CCUtil; using tesseract::IntFeatureSpace; -using tesseract::FontInfo; using tesseract::ParamUtils; using tesseract::ShapeTable; @@ -453,6 +452,7 @@ void FreeTrainingSamples(LIST CharList) { FEATURE_SET FeatureSet; LIST FeatureList; + LIST nodes = CharList; iterate(CharList) { /* iterate through all of the fonts */ char_sample = (LABELEDLIST) first_node(CharList); FeatureList = char_sample->List; @@ -462,7 +462,7 @@ void FreeTrainingSamples(LIST CharList) { } FreeLabeledList(char_sample); } - destroy(CharList); + destroy(nodes); } /* FreeTrainingSamples */ /*---------------------------------------------------------------------------*/ @@ -728,6 +728,7 @@ MERGE_CLASS NewLabeledClass(const char* Label) { void FreeLabeledClassList(LIST ClassList) { MERGE_CLASS MergeClass; + LIST nodes = ClassList; iterate(ClassList) /* iterate through all of the fonts */ { MergeClass = (MERGE_CLASS) first_node (ClassList); @@ -735,7 +736,7 @@ void FreeLabeledClassList(LIST ClassList) { FreeClass(MergeClass->Class); delete MergeClass; } - destroy(ClassList); + destroy(nodes); } /* FreeLabeledClassList */ @@ -825,12 +826,13 @@ void FreeNormProtoList(LIST CharList) { LABELEDLIST char_sample; + LIST nodes = CharList; iterate(CharList) /* iterate through all of the fonts */ { char_sample = (LABELEDLIST) first_node (CharList); FreeLabeledList (char_sample); } - destroy(CharList); + destroy(nodes); } // FreeNormProtoList diff --git a/training/degradeimage.cpp b/training/degradeimage.cpp index f9c3cfb048..333f3703dc 100644 --- a/training/degradeimage.cpp +++ b/training/degradeimage.cpp @@ -22,10 +22,36 @@ #include #include "allheaders.h" // from leptonica +#include "genericvector.h" #include "helpers.h" // For TRand. +#include "rect.h" namespace tesseract { +// A randomized perspective distortion can be applied to synthetic input. +// The perspective distortion comes from leptonica, which uses 2 sets of 4 +// corners to determine the distortion. There are random values for each of +// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead +// defined in terms of a single shear value. This reduces the degrees of +// freedom enough to make the distortion more realistic than it would otherwise +// be if all 8 coordinates could move independently. +// One additional factor is used for the color of the pixels that don't exist +// in the source image. +// Name for each of the randomizing factors. +enum FactorNames { + FN_INCOLOR, + FN_Y0, + FN_Y1, + FN_Y2, + FN_Y3, + FN_X0, + FN_X1, + FN_SHEAR, + // x2 = x1 - shear + // x3 = x0 + shear + FN_NUM_FACTORS +}; + // Rotation is +/- kRotationRange radians. const float kRotationRange = 0.02f; // Number of grey levels to shift by for each exposure step. @@ -144,4 +170,141 @@ Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer, return input; } +// Creates and returns a Pix distorted by various means according to the bool +// flags. If boxes is not NULL, the boxes are resized/positioned according to +// any spatial distortion and also by the integer reduction factor box_scale +// so they will match what the network will output. +// Returns NULL on error. The returned Pix must be pixDestroyed. +Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert, + bool white_noise, bool smooth_noise, bool blur, + int box_reduction, TRand* randomizer, + GenericVector* boxes) { + Pix* distorted = pixCopy(NULL, const_cast(pix)); + // Things to do to synthetic training data. + if (invert && randomizer->SignedRand(1.0) < 0) + pixInvert(distorted, distorted); + if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) { + // TODO(rays) Cook noise in a more thread-safe manner than rand(). + // Attempt to make the sequences reproducible. + srand(randomizer->IntRand()); + Pix* pixn = pixAddGaussianNoise(distorted, 8.0); + pixDestroy(&distorted); + if (smooth_noise) { + distorted = pixBlockconv(pixn, 1, 1); + pixDestroy(&pixn); + } else { + distorted = pixn; + } + } + if (blur && randomizer->SignedRand(1.0) > 0.0) { + Pix* blurred = pixBlockconv(distorted, 1, 1); + pixDestroy(&distorted); + distorted = blurred; + } + if (perspective) + GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes); + if (boxes != NULL) { + for (int b = 0; b < boxes->size(); ++b) { + (*boxes)[b].scale(1.0f / box_reduction); + if ((*boxes)[b].width() <= 0) + (*boxes)[b].set_right((*boxes)[b].left() + 1); + } + } + return distorted; +} + +// Distorts anything that has a non-null pointer with the same pseudo-random +// perspective distortion. Width and height only need to be set if there +// is no pix. If there is a pix, then they will be taken from there. +void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer, + Pix** pix, GenericVector* boxes) { + if (pix != NULL && *pix != NULL) { + width = pixGetWidth(*pix); + height = pixGetHeight(*pix); + } + float* im_coeffs = NULL; + float* box_coeffs = NULL; + l_int32 incolor = + ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs); + if (pix != NULL && *pix != NULL) { + // Transform the image. + Pix* transformed = pixProjective(*pix, im_coeffs, incolor); + if (transformed == NULL) { + tprintf("Projective transformation failed!!\n"); + return; + } + pixDestroy(pix); + *pix = transformed; + } + if (boxes != NULL) { + // Transform the boxes. + for (int b = 0; b < boxes->size(); ++b) { + int x1, y1, x2, y2; + const TBOX& box = (*boxes)[b]; + projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1, + &y1); + projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(), + &x2, &y2); + TBOX new_box1(x1, height - y2, x2, height - y1); + projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(), + &x1, &y1); + projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2, + &y2); + TBOX new_box2(x1, height - y1, x2, height - y2); + (*boxes)[b] = new_box1.bounding_union(new_box2); + } + } + free(im_coeffs); + free(box_coeffs); +} + +// Computes the coefficients of a randomized projective transformation. +// The image transform requires backward transformation coefficient, and the +// box transform the forward coefficients. +// Returns the incolor arg to pixProjective. +int ProjectiveCoeffs(int width, int height, TRand* randomizer, + float** im_coeffs, float** box_coeffs) { + // Setup "from" points. + Pta* src_pts = ptaCreate(4); + ptaAddPt(src_pts, 0.0f, 0.0f); + ptaAddPt(src_pts, width, 0.0f); + ptaAddPt(src_pts, width, height); + ptaAddPt(src_pts, 0.0f, height); + // Extract factors from pseudo-random sequence. + float factors[FN_NUM_FACTORS]; + float shear = 0.0f; // Shear is signed. + for (int i = 0; i < FN_NUM_FACTORS; ++i) { + // Everything is squared to make wild values rarer. + if (i == FN_SHEAR) { + // Shear is signed. + shear = randomizer->SignedRand(0.5 / 3.0); + shear = shear >= 0.0 ? shear * shear : -shear * shear; + // Keep the sheared points within the original rectangle. + if (shear < -factors[FN_X0]) shear = -factors[FN_X0]; + if (shear > factors[FN_X1]) shear = factors[FN_X1]; + factors[i] = shear; + } else if (i != FN_INCOLOR) { + factors[i] = fabs(randomizer->SignedRand(1.0)); + if (i <= FN_Y3) + factors[i] *= 5.0 / 8.0; + else + factors[i] *= 0.5; + factors[i] *= factors[i]; + } + } + // Setup "to" points. + Pta* dest_pts = ptaCreate(4); + ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height); + ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height); + ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width, + (1 - factors[FN_Y2]) * height); + ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width, + (1 - factors[FN_Y3]) * height); + getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs); + getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs); + ptaDestroy(&src_pts); + ptaDestroy(&dest_pts); + return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK; +} + } // namespace tesseract diff --git a/training/degradeimage.h b/training/degradeimage.h index 2add6282f8..a7af9565ff 100644 --- a/training/degradeimage.h +++ b/training/degradeimage.h @@ -20,12 +20,13 @@ #ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_ #define TESSERACT_TRAINING_DEGRADEIMAGE_H_ -struct Pix; +#include "allheaders.h" +#include "genericvector.h" +#include "helpers.h" // For TRand. +#include "rect.h" namespace tesseract { -class TRand; - // Degrade the pix as if by a print/copy/scan cycle with exposure > 0 // corresponding to darkening on the copier and <0 lighter and 0 not copied. // If rotation is not NULL, the clockwise rotation in radians is saved there. @@ -34,6 +35,27 @@ class TRand; struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer, float* rotation); +// Creates and returns a Pix distorted by various means according to the bool +// flags. If boxes is not NULL, the boxes are resized/positioned according to +// any spatial distortion and also by the integer reduction factor box_scale +// so they will match what the network will output. +// Returns NULL on error. The returned Pix must be pixDestroyed. +Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert, + bool white_noise, bool smooth_noise, bool blur, + int box_reduction, TRand* randomizer, + GenericVector* boxes); +// Distorts anything that has a non-null pointer with the same pseudo-random +// perspective distortion. Width and height only need to be set if there +// is no pix. If there is a pix, then they will be taken from there. +void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer, + Pix** pix, GenericVector* boxes); +// Computes the coefficients of a randomized projective transformation. +// The image transform requires backward transformation coefficient, and the +// box transform the forward coefficients. +// Returns the incolor arg to pixProjective. +int ProjectiveCoeffs(int width, int height, TRand* randomizer, + float** im_coeffs, float** box_coeffs); + } // namespace tesseract #endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_ diff --git a/training/fileio.cpp b/training/fileio.cpp index f82582da74..bb1f4afcef 100644 --- a/training/fileio.cpp +++ b/training/fileio.cpp @@ -81,7 +81,7 @@ bool File::ReadFileToString(const string& filename, string* out) { } string File::JoinPath(const string& prefix, const string& suffix) { - return (!prefix.size() || prefix[prefix.size() - 1] == '/') + return (prefix.empty() || prefix[prefix.size() - 1] == '/') ? prefix + suffix : prefix + "/" + suffix; } diff --git a/training/mftraining.cpp b/training/mftraining.cpp index 60314a1cdf..9e2e250927 100644 --- a/training/mftraining.cpp +++ b/training/mftraining.cpp @@ -64,9 +64,6 @@ #include "tprintf.h" #include "unicity_table.h" -using tesseract::Classify; -using tesseract::FontInfo; -using tesseract::FontSpacingInfo; using tesseract::IndexMapBiDi; using tesseract::MasterTrainer; using tesseract::Shape; @@ -305,6 +302,9 @@ int main (int argc, char **argv) { *shape_table, float_classes, inttemp_file.string(), pffmtable_file.string()); + for (int c = 0; c < unicharset->size(); ++c) { + FreeClassFields(&float_classes[c]); + } delete [] float_classes; FreeLabeledClassList(mf_classes); delete trainer; diff --git a/training/normstrngs.cpp b/training/normstrngs.cpp index acffeee13d..e7cac21f4b 100644 --- a/training/normstrngs.cpp +++ b/training/normstrngs.cpp @@ -113,12 +113,12 @@ bool is_double_quote(const char32 ch) { return false; } -STRING NormalizeUTF8String(const char* str8) { +STRING NormalizeUTF8String(bool decompose, const char* str8) { GenericVector str32, out_str32, norm_str; UTF8ToUTF32(str8, &str32); for (int i = 0; i < str32.length(); ++i) { norm_str.clear(); - NormalizeChar32(str32[i], &norm_str); + NormalizeChar32(str32[i], decompose, &norm_str); for (int j = 0; j < norm_str.length(); ++j) { out_str32.push_back(norm_str[j]); } @@ -128,10 +128,10 @@ STRING NormalizeUTF8String(const char* str8) { return out_str8; } -void NormalizeChar32(char32 ch, GenericVector* str) { +void NormalizeChar32(char32 ch, bool decompose, GenericVector* str) { IcuErrorCode error_code; const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance( - NULL, "nfkc", UNORM2_COMPOSE, error_code); + NULL, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE, error_code); error_code.assertSuccess(); error_code.reset(); diff --git a/training/normstrngs.h b/training/normstrngs.h index 71e7b8da08..6fca3193ab 100644 --- a/training/normstrngs.h +++ b/training/normstrngs.h @@ -39,11 +39,16 @@ void UTF32ToUTF8(const GenericVector& str32, STRING* utf8_str); // assumption of this function is that the input is already as fully composed // as it can be, but may require some compatibility normalizations or just // OCR evaluation related normalizations. -void NormalizeChar32(char32 ch, GenericVector* str); +void NormalizeChar32(char32 ch, bool decompose, GenericVector* str); // Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that // can contain multiple UTF32 code points. -STRING NormalizeUTF8String(const char* str8); +STRING NormalizeUTF8String(bool decompose, const char* str8); +// Default behavior is to compose, until it is proven that decomposed benefits +// at least one language. +inline STRING NormalizeUTF8String(const char* str8) { + return NormalizeUTF8String(false, str8); +} // Apply just the OCR-specific normalizations and return the normalized char. char32 OCRNormalize(char32 ch); diff --git a/training/pango_font_info.cpp b/training/pango_font_info.cpp index 6ca8c8998f..41e352eae4 100644 --- a/training/pango_font_info.cpp +++ b/training/pango_font_info.cpp @@ -60,15 +60,6 @@ STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir"); -BOOL_PARAM_FLAG(fontconfig_refresh_cache, false, - "Does a one-time deletion of cache files from the " - "fontconfig_tmpdir before initializing fontconfig."); -BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true, - "Does a one-time reset of the fontconfig config file to point" - " to fonts_dir before initializing fontconfig. Set to true" - " if fontconfig_refresh_cache is true. Set it to false to use" - " multiple instances in separate processes without having to" - " rescan the fonts_dir, using a previously setup font cache"); #ifndef USE_STD_NAMESPACE #include "ocr/trainingdata/typesetting/legacy_fonts.h" @@ -91,7 +82,8 @@ namespace tesseract { // in pixels. const int kDefaultResolution = 300; -bool PangoFontInfo::fontconfig_initialized_ = false; +string PangoFontInfo::fonts_dir_; +string PangoFontInfo::cache_dir_; PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) { Clear(); @@ -119,6 +111,8 @@ void PangoFontInfo::Clear() { } } +PangoFontInfo::~PangoFontInfo() { pango_font_description_free(desc_); } + string PangoFontInfo::DescriptionName() const { if (!desc_) return ""; char* desc_str = pango_font_description_to_string(desc_); @@ -127,59 +121,63 @@ string PangoFontInfo::DescriptionName() const { return desc_name; } -// Initializes Fontconfig for use by writing a fake fonts.conf file into the -// FLAGS_fontconfigs_tmpdir directory, that points to the supplied -// fonts_dir, and then overrides the FONTCONFIG_PATH environment variable -// to point to this fonts.conf file. If force_clear, the cache is refreshed -// even if it has already been initialized. -void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) { - if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) { - fontconfig_initialized_ = true; - return; - } - if (FLAGS_fontconfig_refresh_cache || force_clear) { - File::DeleteMatchingFiles(File::JoinPath( - FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str()); - } - if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache || - force_clear) { - const int MAX_FONTCONF_FILESIZE = 1024; - char fonts_conf_template[MAX_FONTCONF_FILESIZE]; - snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, - "\n" - "\n" - "\n" - "%s\n" - "%s\n" - "\n" - "", fonts_dir.c_str(), - FLAGS_fontconfig_tmpdir.c_str()); - string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(), - "fonts.conf"); - File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); +// If not already initialized, initializes FontConfig by setting its +// environment variable and creating a fonts.conf file that points to the +// FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir. +/* static */ +void PangoFontInfo::SoftInitFontConfig() { + if (fonts_dir_.empty()) { + HardInitFontConfig(FLAGS_fonts_dir.c_str(), + FLAGS_fontconfig_tmpdir.c_str()); } +} + +// Re-initializes font config, whether or not already initialized. +// If already initialized, any existing cache is deleted, just to be sure. +/* static */ +void PangoFontInfo::HardInitFontConfig(const string& fonts_dir, + const string& cache_dir) { + if (!cache_dir_.empty()) { + File::DeleteMatchingFiles( + File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str()); + } + const int MAX_FONTCONF_FILESIZE = 1024; + char fonts_conf_template[MAX_FONTCONF_FILESIZE]; + cache_dir_ = cache_dir; + fonts_dir_ = fonts_dir; + snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, + "\n" + "\n" + "\n" + "%s\n" + "%s\n" + "\n" + "", + fonts_dir.c_str(), cache_dir_.c_str()); + string fonts_conf_file = File::JoinPath(cache_dir_.c_str(), "fonts.conf"); + File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); #ifdef _WIN32 std::string env("FONTCONFIG_PATH="); - env.append(FLAGS_fontconfig_tmpdir.c_str()); + env.append(cache_dir_.c_str()); putenv(env.c_str()); putenv("LANG=en_US.utf8"); #else - setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true); + setenv("FONTCONFIG_PATH", cache_dir_.c_str(), true); // Fix the locale so that the reported font names are consistent. setenv("LANG", "en_US.utf8", true); #endif // _WIN32 - if (!fontconfig_initialized_ || force_clear) { - if (FcInitReinitialize() != FcTrue) { - tprintf("FcInitiReinitialize failed!!\n"); - } + + if (FcInitReinitialize() != FcTrue) { + tprintf("FcInitiReinitialize failed!!\n"); } - fontconfig_initialized_ = true; FontUtils::ReInit(); + // Clear Pango's font cache too. + pango_cairo_font_map_set_default(NULL); } static void ListFontFamilies(PangoFontFamily*** families, int* n_families) { - PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str()); + PangoFontInfo::SoftInitFontConfig(); PangoFontMap* font_map = pango_cairo_font_map_get_default(); DISABLE_HEAP_LEAK_CHECK; pango_font_map_list_families(font_map, families, n_families); @@ -253,7 +251,7 @@ bool PangoFontInfo::ParseFontDescriptionName(const string& name) { // in the font map. Note that if the font is wholly missing, this could // correspond to a completely different font family and face. PangoFont* PangoFontInfo::ToPangoFont() const { - InitFontConfig(false, FLAGS_fonts_dir.c_str()); + SoftInitFontConfig(); PangoFontMap* font_map = pango_cairo_font_map_get_default(); PangoContext* context = pango_context_new(); pango_cairo_context_set_resolution(context, resolution_); @@ -437,10 +435,15 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len, PangoGlyph dotted_circle_glyph; PangoFont* font = run->item->analysis.font; - PangoGlyphString * glyphs = pango_glyph_string_new(); +#ifdef _WIN32 // Fixme! Leaks memory and breaks unittests. + PangoGlyphString* glyphs = pango_glyph_string_new(); char s[] = "\xc2\xa7"; pango_shape(s, sizeof(s), &(run->item->analysis), glyphs); dotted_circle_glyph = glyphs->glyphs[0].glyph; +#else + dotted_circle_glyph = pango_fc_font_get_glyph( + reinterpret_cast(font), kDottedCircleGlyph); +#endif if (TLOG_IS_ON(2)) { PangoFontDescription* desc = pango_font_describe(font); @@ -519,22 +522,21 @@ vector FontUtils::available_fonts_; // cache list bool FontUtils::IsAvailableFont(const char* input_query_desc, string* best_match) { string query_desc(input_query_desc); - if (PANGO_VERSION <= 12005) { - // Strip commas and any ' Medium' substring in the name. - query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','), - query_desc.end()); - const string kMediumStr = " Medium"; - std::size_t found = query_desc.find(kMediumStr); - if (found != std::string::npos) { - query_desc.erase(found, kMediumStr.length()); - } +#if (PANGO_VERSION <= 12005) + // Strip commas and any ' Medium' substring in the name. + query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','), + query_desc.end()); + const string kMediumStr = " Medium"; + std::size_t found = query_desc.find(kMediumStr); + if (found != std::string::npos) { + query_desc.erase(found, kMediumStr.length()); } - +#endif PangoFontDescription *desc = pango_font_description_from_string( query_desc.c_str()); PangoFont* selected_font = NULL; { - PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str()); + PangoFontInfo::SoftInitFontConfig(); PangoFontMap* font_map = pango_cairo_font_map_get_default(); PangoContext* context = pango_context_new(); pango_context_set_font_map(context, font_map); @@ -589,7 +591,7 @@ static bool ShouldIgnoreFontFamilyName(const char* query) { // Outputs description names of available fonts. /* static */ const vector& FontUtils::ListAvailableFonts() { - if (available_fonts_.size()) { + if (!available_fonts_.empty()) { return available_fonts_; } #ifndef USE_STD_NAMESPACE @@ -687,8 +689,7 @@ void FontUtils::GetAllRenderableCharacters(const vector& fonts, /* static */ int FontUtils::FontScore(const TessHashMap& ch_map, - const string& fontname, - int* raw_score, + const string& fontname, int* raw_score, vector* ch_flags) { PangoFontInfo font_info; if (!font_info.ParseFontDescriptionName(fontname)) { diff --git a/training/pango_font_info.h b/training/pango_font_info.h index fc46fcf48b..09a43fab14 100644 --- a/training/pango_font_info.h +++ b/training/pango_font_info.h @@ -24,10 +24,16 @@ #include #include +#include "commandlineflags.h" #include "hashfn.h" #include "host.h" -#include "util.h" #include "pango/pango-font.h" +#include "pango/pango.h" +#include "pango/pangocairo.h" +#include "util.h" + +DECLARE_STRING_PARAM_FLAG(fonts_dir); +DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir); typedef signed int char32; @@ -44,6 +50,7 @@ class PangoFontInfo { DECORATIVE, }; PangoFontInfo(); + ~PangoFontInfo(); // Initialize from parsing a font description name, defined as a string of the // format: // "FamilyName [FaceName] [PointSize]" @@ -83,10 +90,14 @@ class PangoFontInfo { bool GetSpacingProperties(const string& utf8_char, int* x_bearing, int* x_advance) const; - // Initializes FontConfig by setting its environment variable and creating - // a fonts.conf file that points to the given fonts_dir. Once initialized, - // it is not re-initialized unless force_clear is true. - static void InitFontConfig(bool force_clear, const string& fonts_dir); + // If not already initialized, initializes FontConfig by setting its + // environment variable and creating a fonts.conf file that points to the + // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir. + static void SoftInitFontConfig(); + // Re-initializes font config, whether or not already initialized. + // If already initialized, any existing cache is deleted, just to be sure. + static void HardInitFontConfig(const string& fonts_dir, + const string& cache_dir); // Accessors string DescriptionName() const; @@ -130,8 +141,14 @@ class PangoFontInfo { int resolution_; // Fontconfig operates through an environment variable, so it intrinsically // cannot be thread-friendly, but you can serialize multiple independent - // font configurations by calling InitFontConfig(true, path). - static bool fontconfig_initialized_; + // font configurations by calling HardInitFontConfig(fonts_dir, cache_dir). + // These hold the last initialized values set by HardInitFontConfig or + // the first call to SoftInitFontConfig. + // Directory to be scanned for font files. + static string fonts_dir_; + // Directory to store the cache of font information. (Can be the same as + // fonts_dir_) + static string cache_dir_; private: PangoFontInfo(const PangoFontInfo&); diff --git a/training/stringrenderer.cpp b/training/stringrenderer.cpp index 66bbf7d28e..e7f9699f18 100644 --- a/training/stringrenderer.cpp +++ b/training/stringrenderer.cpp @@ -347,6 +347,11 @@ void StringRenderer::ClearBoxes() { boxaDestroy(&page_boxes_); } +string StringRenderer::GetBoxesStr() { + BoxChar::PrepareToWrite(&boxchars_); + return BoxChar::GetTesseractBoxStr(page_height_, boxchars_); +} + void StringRenderer::WriteAllBoxes(const string& filename) { BoxChar::PrepareToWrite(&boxchars_); BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_); @@ -395,7 +400,7 @@ bool StringRenderer::GetClusterStrings(vector* cluster_text) { it != start_byte_to_text.end(); ++it) { cluster_text->push_back(it->second); } - return cluster_text->size(); + return !cluster_text->empty(); } // Merges an array of BoxChars into words based on the identification of @@ -495,7 +500,7 @@ void StringRenderer::ComputeClusterBoxes() { const int end_byte_index = cluster_start_to_end_index[start_byte_index]; string cluster_text = string(text + start_byte_index, end_byte_index - start_byte_index); - if (cluster_text.size() && cluster_text[0] == '\n') { + if (!cluster_text.empty() && cluster_text[0] == '\n') { tlog(2, "Skipping newlines at start of text.\n"); continue; } @@ -595,11 +600,12 @@ void StringRenderer::ComputeClusterBoxes() { all_boxes = boxaCreate(0); boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE); } - boxaGetExtent(all_boxes, NULL, NULL, &page_box); - boxaDestroy(&all_boxes); - if (page_boxes_ == NULL) - page_boxes_ = boxaCreate(0); - boxaAddBox(page_boxes_, page_box, L_INSERT); + if (all_boxes != NULL) { + boxaGetExtent(all_boxes, NULL, NULL, &page_box); + boxaDestroy(&all_boxes); + if (page_boxes_ == NULL) page_boxes_ = boxaCreate(0); + boxaAddBox(page_boxes_, page_box, L_INSERT); + } } diff --git a/training/unicharset_training_utils.cpp b/training/unicharset_training_utils.cpp index 10aaf0e6c3..efa3a22cd5 100644 --- a/training/unicharset_training_utils.cpp +++ b/training/unicharset_training_utils.cpp @@ -37,7 +37,8 @@ namespace tesseract { // Helper sets the character attribute properties and sets up the script table. // Does not set tops and bottoms. -void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) { +void SetupBasicProperties(bool report_errors, bool decompose, + UNICHARSET* unicharset) { for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { // Convert any custom ligatures. const char* unichar_str = unicharset->id_to_unichar(unichar_id); @@ -129,7 +130,7 @@ void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) { } // Record normalized version of this unichar. - STRING normed_str = tesseract::NormalizeUTF8String(unichar_str); + STRING normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str); if (unichar_id != 0 && normed_str.length() > 0) { unicharset->set_normed(unichar_id, normed_str.c_str()); } else { @@ -158,7 +159,7 @@ void SetPropertiesForInputFile(const string& script_dir, // Set unichar properties tprintf("Setting unichar properties\n"); - SetupBasicProperties(true, &unicharset); + SetupBasicProperties(true, false, &unicharset); string xheights_str; for (int s = 0; s < unicharset.get_script_table_size(); ++s) { // Load the unicharset for the script if available. diff --git a/training/unicharset_training_utils.h b/training/unicharset_training_utils.h index ff2262875d..f03e12ace4 100644 --- a/training/unicharset_training_utils.h +++ b/training/unicharset_training_utils.h @@ -33,7 +33,13 @@ namespace tesseract { // Helper sets the character attribute properties and sets up the script table. // Does not set tops and bottoms. -void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset); +void SetupBasicProperties(bool report_errors, bool decompose, + UNICHARSET* unicharset); +// Default behavior is to compose, until it is proven that decomposed benefits +// at least one language. +inline void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) { + SetupBasicProperties(report_errors, false, unicharset); +} // Helper to set the properties for an input unicharset file, writes to the // output file. If an appropriate script unicharset can be found in the