From 70993585106155af617ef93306539e854a4ce19b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Zdenko=20Podobn=C3=BD?= <zdenop@gmail.com>
Date: Sun, 4 Dec 2016 22:06:52 +0100
Subject: [PATCH] backport from 4.00: training

---
 training/boxchar.cpp                   |  12 +-
 training/boxchar.h                     |   3 +
 training/cntraining.cpp                |  42 ++++---
 training/commontraining.cpp            |  10 +-
 training/degradeimage.cpp              | 163 +++++++++++++++++++++++++
 training/degradeimage.h                |  28 ++++-
 training/fileio.cpp                    |   2 +-
 training/mftraining.cpp                |   6 +-
 training/normstrngs.cpp                |   8 +-
 training/normstrngs.h                  |   9 +-
 training/pango_font_info.cpp           | 129 +++++++++----------
 training/pango_font_info.h             |  31 +++--
 training/stringrenderer.cpp            |  20 +--
 training/unicharset_training_utils.cpp |   7 +-
 training/unicharset_training_utils.h   |   8 +-
 15 files changed, 357 insertions(+), 121 deletions(-)
diff --git a/training/boxchar.cpp b/training/boxchar.cpp
index b99c12a600..4324597744 100644
--- a/training/boxchar.cpp
+++ b/training/boxchar.cpp
@@ -73,7 +73,6 @@ void BoxChar::PrepareToWrite(vector<BoxChar*>* boxes) {
   if (rtl_rules) {
     ReorderRTLText(boxes);
   }
-  tprintf("Rtl = %d ,vertical=%d\n", rtl_rules, vertical_rules);
 }
 
 // Inserts newline (tab) characters into the vector at newline positions.
@@ -291,13 +290,19 @@ const int kMaxLineLength = 1024;
 /* static */
 void BoxChar::WriteTesseractBoxFile(const string& filename, int height,
                                     const vector<BoxChar*>& boxes) {
+  string output = GetTesseractBoxStr(height, boxes);  // "" if a box is NULL.
+  File::WriteStringToFileOrDie(output, filename);
+}
+
+/* static */
+string BoxChar::GetTesseractBoxStr(int height, const vector<BoxChar*>& boxes) {
   string output;
   char buffer[kMaxLineLength];
   for (int i = 0; i < boxes.size(); ++i) {
     const Box* box = boxes[i]->box_;
     if (box == NULL) {
       tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n");
-      return;
+      return "";
     }
     int nbytes =
         snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n",
@@ -305,6 +310,7 @@ void BoxChar::WriteTesseractBoxFile(const string& filename, int height,
                  box->x + box->w, height - box->y, boxes[i]->page_);
     output.append(buffer, nbytes);
   }
-  File::WriteStringToFileOrDie(output, filename);
+  return output;
 }
+
 }  // namespace tesseract
diff --git a/training/boxchar.h b/training/boxchar.h
index 27b568a143..3748c4abad 100644
--- a/training/boxchar.h
+++ b/training/boxchar.h
@@ -100,6 +100,9 @@ class BoxChar {
   // is needed to convert to tesseract coordinates.
   static void WriteTesseractBoxFile(const string& name, int height,
                                     const vector<BoxChar*>& boxes);
+  // Gets the tesseract box file as a string from the vector of boxes.
+  // The image height is needed to convert to tesseract coordinates.
+  static string GetTesseractBoxStr(int height, const vector<BoxChar*>& boxes);
 
  private:
   string ch_;
diff --git a/training/cntraining.cpp b/training/cntraining.cpp
index 916a758576..6f4f42aebe 100644
--- a/training/cntraining.cpp
+++ b/training/cntraining.cpp
@@ -52,8 +52,8 @@ int main (
           Private Function Prototypes
 ----------------------------------------------------------------------------*/
 
-void WriteNormProtos (const char  *Directory, LIST  LabeledProtoList,
-                      CLUSTERER *Clusterer);
+void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
+                     const FEATURE_DESC_STRUCT *feature_desc);
 
 /*
 PARAMDESC *ConvertToPARAMDESC(
@@ -160,13 +160,18 @@ int main(int argc, char *argv[]) {
   // reduce the min samples:
   // Config.MinSamples = 0.5 / num_fonts;
   pCharList = CharList;
+  // The norm protos will count the source protos, so we keep them here in
+  // freeable_protos, so they can be freed later.
+  GenericVector<LIST> freeable_protos;
   iterate(pCharList) {
     //Cluster
-    if (Clusterer)
-       FreeClusterer(Clusterer);
     CharSample = (LABELEDLIST)first_node(pCharList);
     Clusterer =
       SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
+    if (Clusterer == NULL) {  // To avoid a SIGSEGV
+      fprintf(stderr, "Error: NULL clusterer!\n");
+      return 1;
+    }
     float SavedMinSamples = Config.MinSamples;
     // To disable the tendency to produce a single cluster for all fonts,
     // make MagicSamples an impossible to achieve number:
@@ -185,21 +190,21 @@ int main(int argc, char *argv[]) {
     }
     Config.MinSamples = SavedMinSamples;
     AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
+    freeable_protos.push_back(ProtoList);
+    FreeClusterer(Clusterer);
   }
   FreeTrainingSamples(CharList);
-  if (Clusterer == NULL) { // To avoid a SIGSEGV
-    fprintf(stderr, "Error: NULL clusterer!\n");
-    return 1;
-  }
-  WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer);
+  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
+  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
+                  FeatureDefs.FeatureDesc[desc_index]);
   FreeNormProtoList(NormProtoList);
-  FreeProtoList(&ProtoList);
-  FreeClusterer(Clusterer);
+  for (int i = 0; i < freeable_protos.size(); ++i) {
+    FreeProtoList(&freeable_protos[i]);
+  }
   printf ("\n");
   return 0;
 }  // main
 
-
 /*----------------------------------------------------------------------------
               Private Code
 ----------------------------------------------------------------------------*/
@@ -211,14 +216,13 @@ int main(int argc, char *argv[]) {
 * of the samples.
 * @param Directory  directory to place sample files into
 * @param LabeledProtoList List of labeled protos
-* @param Clusterer The CLUSTERER to use
+* @param feature_desc Description of the features
 * @return none
 * @note Exceptions: none
 * @note History: Fri Aug 18 16:17:06 1989, DSJ, Created.
 */
-void WriteNormProtos(const char  *Directory, LIST  LabeledProtoList,
-                     CLUSTERER *Clusterer)
-{
+void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
+                     const FEATURE_DESC_STRUCT *feature_desc) {
   FILE    *File;
   STRING Filename;
   LABELEDLIST LabeledProto;
@@ -233,8 +237,8 @@ void WriteNormProtos(const char  *Directory, LIST  LabeledProtoList,
   Filename += "normproto";
   printf ("\nWriting %s ...", Filename.string());
   File = Efopen (Filename.string(), "wb");
-  fprintf(File, "%0d\n", Clusterer->SampleSize);
-  WriteParamDesc(File, Clusterer->SampleSize,Clusterer->ParamDesc);
+  fprintf(File, "%0d\n", feature_desc->NumParams);
+  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
   iterate(LabeledProtoList)
   {
     LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
@@ -249,7 +253,7 @@ void WriteNormProtos(const char  *Directory, LIST  LabeledProtoList,
       exit(1);
     }
     fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
-    WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false);
+    WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
   }
   fclose (File);
 
diff --git a/training/commontraining.cpp b/training/commontraining.cpp
index b7243e6f3f..f77a553de4 100644
--- a/training/commontraining.cpp
+++ b/training/commontraining.cpp
@@ -40,7 +40,6 @@
 
 using tesseract::CCUtil;
 using tesseract::IntFeatureSpace;
-using tesseract::FontInfo;
 using tesseract::ParamUtils;
 using tesseract::ShapeTable;
 
@@ -453,6 +452,7 @@ void FreeTrainingSamples(LIST CharList) {
   FEATURE_SET FeatureSet;
   LIST FeatureList;
 
+  LIST nodes = CharList;
   iterate(CharList) { /* iterate through all of the fonts */
     char_sample = (LABELEDLIST) first_node(CharList);
     FeatureList = char_sample->List;
@@ -462,7 +462,7 @@ void FreeTrainingSamples(LIST CharList) {
     }
     FreeLabeledList(char_sample);
   }
-  destroy(CharList);
+  destroy(nodes);
 }  /* FreeTrainingSamples */
 
 /*---------------------------------------------------------------------------*/
@@ -728,6 +728,7 @@ MERGE_CLASS NewLabeledClass(const char* Label) {
 void FreeLabeledClassList(LIST ClassList) {
   MERGE_CLASS MergeClass;
 
+  LIST nodes = ClassList;
   iterate(ClassList) /* iterate through all of the fonts */
   {
     MergeClass = (MERGE_CLASS) first_node (ClassList);
@@ -735,7 +736,7 @@ void FreeLabeledClassList(LIST ClassList) {
     FreeClass(MergeClass->Class);
     delete MergeClass;
   }
-  destroy(ClassList);
+  destroy(nodes);
 
 } /* FreeLabeledClassList */
 
@@ -825,12 +826,13 @@ void FreeNormProtoList(LIST CharList)
 {
   LABELEDLIST char_sample;
 
+  LIST nodes = CharList;
   iterate(CharList) /* iterate through all of the fonts */
   {
     char_sample = (LABELEDLIST) first_node (CharList);
     FreeLabeledList (char_sample);
   }
-  destroy(CharList);
+  destroy(nodes);
 
 }  // FreeNormProtoList
 
diff --git a/training/degradeimage.cpp b/training/degradeimage.cpp
index f9c3cfb048..333f3703dc 100644
--- a/training/degradeimage.cpp
+++ b/training/degradeimage.cpp
@@ -22,10 +22,36 @@
 
 #include <stdlib.h>
 #include "allheaders.h"   // from leptonica
+#include "genericvector.h"
 #include "helpers.h"  // For TRand.
+#include "rect.h"
 
 namespace tesseract {
 
+// A randomized perspective distortion can be applied to synthetic input.
+// The perspective distortion comes from leptonica, which uses 2 sets of 4
+// corners to determine the distortion. There are random values for each of
+// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead
+// defined in terms of a single shear value. This reduces the degrees of
+// freedom enough to make the distortion more realistic than it would otherwise
+// be if all 8 coordinates could move independently.
+// One additional factor is used for the color of the pixels that don't exist
+// in the source image.
+// Name for each of the randomizing factors.
+enum FactorNames {
+  FN_INCOLOR,  // Picks white or black for pixels from outside the source.
+  FN_Y0,       // y offset of corner (0, 0), as a fraction of the height.
+  FN_Y1,       // y offset of corner (width, 0).
+  FN_Y2,       // Inward y offset of corner (width, height).
+  FN_Y3,       // Inward y offset of corner (0, height).
+  FN_X0,       // x offset of the corners at x = 0 (fraction of the width).
+  FN_X1,       // Inward x offset of the corners at x = width.
+  FN_SHEAR,    // Signed x shift added to the two corners at y = height.
+  // x2 = x1 - shear
+  // x3 = x0 + shear
+  FN_NUM_FACTORS  // Number of factors; sizes the factors array.
+};
+
 // Rotation is +/- kRotationRange radians.
 const float kRotationRange = 0.02f;
 // Number of grey levels to shift by for each exposure step.
@@ -144,4 +170,141 @@ Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer,
   return input;
 }
 
+// Creates and returns a Pix distorted by various means according to the bool
+// flags. If boxes is not NULL, the boxes are resized/positioned according to
+// any spatial distortion and also by the integer reduction factor
+// box_reduction so they will match what the network will output.
+// Returns NULL on error. The returned Pix must be pixDestroyed.
+Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
+                         bool white_noise, bool smooth_noise, bool blur,
+                         int box_reduction, TRand* randomizer,
+                         GenericVector<TBOX>* boxes) {
+  Pix* distorted = pixCopy(NULL, const_cast<Pix*>(pix));  // Work on a copy.
+  // Things to do to synthetic training data.
+  if (invert && randomizer->SignedRand(1.0) < 0)  // Only on a negative draw.
+    pixInvert(distorted, distorted);
+  if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) {
+    // TODO(rays) Cook noise in a more thread-safe manner than rand().
+    // Attempt to make the sequences reproducible.
+    srand(randomizer->IntRand());
+    Pix* pixn = pixAddGaussianNoise(distorted, 8.0);
+    pixDestroy(&distorted);  // Replaced below by pixn or its smoothed form.
+    if (smooth_noise) {
+      distorted = pixBlockconv(pixn, 1, 1);
+      pixDestroy(&pixn);
+    } else {
+      distorted = pixn;
+    }
+  }
+  if (blur && randomizer->SignedRand(1.0) > 0.0) {
+    Pix* blurred = pixBlockconv(distorted, 1, 1);
+    pixDestroy(&distorted);
+    distorted = blurred;
+  }
+  if (perspective)
+    GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes);
+  if (boxes != NULL) {
+    for (int b = 0; b < boxes->size(); ++b) {
+      (*boxes)[b].scale(1.0f / box_reduction);  // Needs box_reduction != 0.
+      if ((*boxes)[b].width() <= 0)  // Don't let scaling collapse a box.
+        (*boxes)[b].set_right((*boxes)[b].left() + 1);
+    }
+  }
+  return distorted;
+}
+
+// Distorts anything that has a non-null pointer with the same pseudo-random
+// perspective distortion. Width and height only need to be set if there
+// is no pix. If there is a pix, then they will be taken from there.
+void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
+                                   Pix** pix, GenericVector<TBOX>* boxes) {
+  if (pix != NULL && *pix != NULL) {
+    width = pixGetWidth(*pix);
+    height = pixGetHeight(*pix);
+  }
+  float *im_coeffs = NULL, *box_coeffs = NULL;  // Both freed before returning.
+  l_int32 incolor =
+      ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs);
+  if (pix != NULL && *pix != NULL) {
+    // Transform the image.
+    Pix* transformed = pixProjective(*pix, im_coeffs, incolor);
+    if (transformed == NULL) {
+      tprintf("Projective transformation failed!!\n");
+      boxes = NULL;  // Leave the boxes alone, but still free the coeffs.
+    } else {
+      pixDestroy(pix);
+      *pix = transformed;
+    }
+  }
+  if (boxes != NULL) {
+    // Transform the boxes.
+    for (int b = 0; b < boxes->size(); ++b) {
+      int x1, y1, x2, y2;
+      const TBOX& box = (*boxes)[b];
+      projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1,
+                               &y1);
+      projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(),
+                               &x2, &y2);
+      TBOX new_box1(x1, height - y2, x2, height - y1);  // One diagonal.
+      projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(),
+                               &x1, &y1);
+      projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2,
+                               &y2);
+      TBOX new_box2(x1, height - y1, x2, height - y2);  // The other diagonal.
+      (*boxes)[b] = new_box1.bounding_union(new_box2);
+    }
+  }
+  free(im_coeffs);
+  free(box_coeffs);
+}
+
+// Computes the coefficients of a randomized projective transformation.
+// The image transform requires backward transformation coefficients, and the
+// box transform the forward coefficients.
+// Returns the incolor arg to pixProjective.
+int ProjectiveCoeffs(int width, int height, TRand* randomizer,
+                     float** im_coeffs, float** box_coeffs) {
+  // Setup "from" points.
+  Pta* src_pts = ptaCreate(4);
+  ptaAddPt(src_pts, 0.0f, 0.0f);
+  ptaAddPt(src_pts, width, 0.0f);
+  ptaAddPt(src_pts, width, height);
+  ptaAddPt(src_pts, 0.0f, height);
+  // Extract factors from pseudo-random sequence.
+  float factors[FN_NUM_FACTORS];
+  float shear = 0.0f;  // Shear is signed.
+  for (int i = 0; i < FN_NUM_FACTORS; ++i) {
+    // Everything but the incolor is squared to make wild values rarer.
+    if (i == FN_SHEAR) {
+      // Shear is signed.
+      shear = randomizer->SignedRand(0.5 / 3.0);
+      shear = shear >= 0.0 ? shear * shear : -shear * shear;
+      // Keep the sheared points within the original rectangle.
+      if (shear < -factors[FN_X0]) shear = -factors[FN_X0];
+      if (shear > factors[FN_X1]) shear = factors[FN_X1];
+      factors[i] = shear;
+    } else if (i != FN_INCOLOR) {
+      factors[i] = fabs(randomizer->SignedRand(1.0));
+      factors[i] *= (i <= FN_Y3) ? 5.0 / 8.0 : 0.5;
+      factors[i] *= factors[i];
+    } else {
+      // Was left unset but read by the return below; it only picks a color.
+      factors[i] = fabs(randomizer->SignedRand(1.0));
+    }
+  }
+  // Setup "to" points.
+  Pta* dest_pts = ptaCreate(4);
+  ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height);
+  ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height);
+  ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width,
+           (1 - factors[FN_Y2]) * height);
+  ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width,
+           (1 - factors[FN_Y3]) * height);
+  getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs);
+  getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs);
+  ptaDestroy(&src_pts);
+  ptaDestroy(&dest_pts);
+  return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK;
+}
+
 }  // namespace tesseract
diff --git a/training/degradeimage.h b/training/degradeimage.h
index 2add6282f8..a7af9565ff 100644
--- a/training/degradeimage.h
+++ b/training/degradeimage.h
@@ -20,12 +20,13 @@
 #ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_
 #define TESSERACT_TRAINING_DEGRADEIMAGE_H_
 
-struct Pix;
+#include "allheaders.h"
+#include "genericvector.h"
+#include "helpers.h"  // For TRand.
+#include "rect.h"
 
 namespace tesseract {
 
-class TRand;
-
 // Degrade the pix as if by a print/copy/scan cycle with exposure > 0
 // corresponding to darkening on the copier and <0 lighter and 0 not copied.
 // If rotation is not NULL, the clockwise rotation in radians is saved there.
@@ -34,6 +35,27 @@ class TRand;
 struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer,
                          float* rotation);
 
+// Creates and returns a Pix distorted by various means according to the bool
+// flags. If boxes is not NULL, the boxes are resized/positioned according to
+// any spatial distortion and also by the integer reduction factor
+// box_reduction so they will match what the network will output.
+// Returns NULL on error. The returned Pix must be pixDestroyed.
+Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert,
+                         bool white_noise, bool smooth_noise, bool blur,
+                         int box_reduction, TRand* randomizer,
+                         GenericVector<TBOX>* boxes);
+// Distorts anything that has a non-null pointer with the same pseudo-random
+// perspective distortion. Width and height only need to be set if there
+// is no pix. If there is a pix, then they will be taken from there.
+void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer,
+                                   Pix** pix, GenericVector<TBOX>* boxes);
+// Computes the coefficients of a randomized projective transformation.
+// The image transform requires backward transformation coefficient, and the
+// box transform the forward coefficients.
+// Returns the incolor arg to pixProjective.
+int ProjectiveCoeffs(int width, int height, TRand* randomizer,
+                     float** im_coeffs, float** box_coeffs);
+
 }  // namespace tesseract
 
 #endif  // TESSERACT_TRAINING_DEGRADEIMAGE_H_
diff --git a/training/fileio.cpp b/training/fileio.cpp
index f82582da74..bb1f4afcef 100644
--- a/training/fileio.cpp
+++ b/training/fileio.cpp
@@ -81,7 +81,7 @@ bool File::ReadFileToString(const string& filename, string* out) {
 }
 
 string File::JoinPath(const string& prefix, const string& suffix) {
-  return (!prefix.size() || prefix[prefix.size() - 1] == '/')
+  return (prefix.empty() || prefix[prefix.size() - 1] == '/')
              ? prefix + suffix
              : prefix + "/" + suffix;
 }
diff --git a/training/mftraining.cpp b/training/mftraining.cpp
index 60314a1cdf..9e2e250927 100644
--- a/training/mftraining.cpp
+++ b/training/mftraining.cpp
@@ -64,9 +64,6 @@
 #include "tprintf.h"
 #include "unicity_table.h"
 
-using tesseract::Classify;
-using tesseract::FontInfo;
-using tesseract::FontSpacingInfo;
 using tesseract::IndexMapBiDi;
 using tesseract::MasterTrainer;
 using tesseract::Shape;
@@ -305,6 +302,9 @@ int main (int argc, char **argv) {
                                     *shape_table, float_classes,
                                     inttemp_file.string(),
                                     pffmtable_file.string());
+  for (int c = 0; c < unicharset->size(); ++c) {
+    FreeClassFields(&float_classes[c]);
+  }
   delete [] float_classes;
   FreeLabeledClassList(mf_classes);
   delete trainer;
diff --git a/training/normstrngs.cpp b/training/normstrngs.cpp
index acffeee13d..e7cac21f4b 100644
--- a/training/normstrngs.cpp
+++ b/training/normstrngs.cpp
@@ -113,12 +113,12 @@ bool is_double_quote(const char32 ch) {
   return false;
 }
 
-STRING NormalizeUTF8String(const char* str8) {
+STRING NormalizeUTF8String(bool decompose, const char* str8) {
   GenericVector<char32> str32, out_str32, norm_str;
   UTF8ToUTF32(str8, &str32);
   for (int i = 0; i < str32.length(); ++i) {
     norm_str.clear();
-    NormalizeChar32(str32[i], &norm_str);
+    NormalizeChar32(str32[i], decompose, &norm_str);
     for (int j = 0; j < norm_str.length(); ++j) {
       out_str32.push_back(norm_str[j]);
     }
@@ -128,10 +128,10 @@ STRING NormalizeUTF8String(const char* str8) {
   return out_str8;
 }
 
-void NormalizeChar32(char32 ch, GenericVector<char32>* str) {
+void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str) {
   IcuErrorCode error_code;
   const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
-      NULL, "nfkc", UNORM2_COMPOSE, error_code);
+      NULL, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE, error_code);
   error_code.assertSuccess();
   error_code.reset();
 
diff --git a/training/normstrngs.h b/training/normstrngs.h
index 71e7b8da08..6fca3193ab 100644
--- a/training/normstrngs.h
+++ b/training/normstrngs.h
@@ -39,11 +39,16 @@ void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str);
 // assumption of this function is that the input is already as fully composed
 // as it can be, but may require some compatibility normalizations or just
 // OCR evaluation related normalizations.
-void NormalizeChar32(char32 ch, GenericVector<char32>* str);
+void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str);
 
 // Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that
 // can contain multiple UTF32 code points.
-STRING NormalizeUTF8String(const char* str8);
+STRING NormalizeUTF8String(bool decompose, const char* str8);
+// Convenience overload that composes (decompose = false). Composing stays the
+// default until it is proven that decomposed benefits at least one language.
+inline STRING NormalizeUTF8String(const char* str8) {
+  return NormalizeUTF8String(false, str8);
+}
 
 // Apply just the OCR-specific normalizations and return the normalized char.
 char32 OCRNormalize(char32 ch);
diff --git a/training/pango_font_info.cpp b/training/pango_font_info.cpp
index 6ca8c8998f..41e352eae4 100644
--- a/training/pango_font_info.cpp
+++ b/training/pango_font_info.cpp
@@ -60,15 +60,6 @@
 
 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
                   "Overrides fontconfig default temporary dir");
-BOOL_PARAM_FLAG(fontconfig_refresh_cache, false,
-                "Does a one-time deletion of cache files from the "
-                "fontconfig_tmpdir before initializing fontconfig.");
-BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true,
-                "Does a one-time reset of the fontconfig config file to point"
-                " to fonts_dir before initializing fontconfig. Set to true"
-                " if fontconfig_refresh_cache is true. Set it to false to use"
-                " multiple instances in separate processes without having to"
-                " rescan the fonts_dir, using a previously setup font cache");
 
 #ifndef USE_STD_NAMESPACE
 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
@@ -91,7 +82,8 @@ namespace tesseract {
 // in pixels.
 const int kDefaultResolution = 300;
 
-bool PangoFontInfo::fontconfig_initialized_ = false;
+string PangoFontInfo::fonts_dir_;
+string PangoFontInfo::cache_dir_;
 
 PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) {
   Clear();
@@ -119,6 +111,8 @@ void PangoFontInfo::Clear() {
   }
 }
 
+PangoFontInfo::~PangoFontInfo() { pango_font_description_free(desc_); }
+
 string PangoFontInfo::DescriptionName() const {
   if (!desc_) return "";
   char* desc_str = pango_font_description_to_string(desc_);
@@ -127,59 +121,63 @@ string PangoFontInfo::DescriptionName() const {
   return desc_name;
 }
 
-// Initializes Fontconfig for use by writing a fake fonts.conf file into the
-// FLAGS_fontconfigs_tmpdir directory, that points to the supplied
-// fonts_dir, and then overrides the FONTCONFIG_PATH environment variable
-// to point to this fonts.conf file. If force_clear, the cache is refreshed
-// even if it has already been initialized.
-void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) {
-  if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) {
-    fontconfig_initialized_ = true;
-    return;
-  }
-  if (FLAGS_fontconfig_refresh_cache || force_clear) {
-    File::DeleteMatchingFiles(File::JoinPath(
-        FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str());
-  }
-  if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache ||
-      force_clear) {
-    const int MAX_FONTCONF_FILESIZE = 1024;
-    char fonts_conf_template[MAX_FONTCONF_FILESIZE];
-    snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
-             "<?xml version=\"1.0\"?>\n"
-             "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
-             "<fontconfig>\n"
-             "<dir>%s</dir>\n"
-             "<cachedir>%s</cachedir>\n"
-             "<config></config>\n"
-             "</fontconfig>", fonts_dir.c_str(),
-             FLAGS_fontconfig_tmpdir.c_str());
-    string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(),
-                                            "fonts.conf");
-    File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
+// If not already initialized, initializes FontConfig by setting its
+// environment variable and creating a fonts.conf file that points to the
+// FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
+/* static */
+void PangoFontInfo::SoftInitFontConfig() {
+  if (fonts_dir_.empty()) {  // fonts_dir_ is set by HardInitFontConfig.
+    HardInitFontConfig(FLAGS_fonts_dir.c_str(),
+                       FLAGS_fontconfig_tmpdir.c_str());
+  }
+}
+
+// Re-initializes font config, whether or not already initialized.
+// If already initialized, any existing cache is deleted, just to be sure.
+/* static */
+void PangoFontInfo::HardInitFontConfig(const string& fonts_dir,
+                                       const string& cache_dir) {
+  if (!cache_dir_.empty()) {
+    File::DeleteMatchingFiles(
+        File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str());
+  }
+  const int MAX_FONTCONF_FILESIZE = 1024;
+  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
+  cache_dir_ = cache_dir;
+  fonts_dir_ = fonts_dir;
+  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
+           "<?xml version=\"1.0\"?>\n"
+           "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
+           "<fontconfig>\n"
+           "<dir>%s</dir>\n"
+           "<cachedir>%s</cachedir>\n"
+           "<config></config>\n"
+           "</fontconfig>",
+           fonts_dir.c_str(), cache_dir_.c_str());
+  string fonts_conf_file = File::JoinPath(cache_dir_.c_str(), "fonts.conf");
+  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
 #ifdef _WIN32
   std::string env("FONTCONFIG_PATH=");
-  env.append(FLAGS_fontconfig_tmpdir.c_str());
+  env.append(cache_dir_.c_str());
   putenv(env.c_str());
   putenv("LANG=en_US.utf8");
 #else
-  setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true);
+  setenv("FONTCONFIG_PATH", cache_dir_.c_str(), true);
   // Fix the locale so that the reported font names are consistent.
   setenv("LANG", "en_US.utf8", true);
 #endif  // _WIN32
-  if (!fontconfig_initialized_ || force_clear) {
-    if (FcInitReinitialize() != FcTrue) {
-      tprintf("FcInitiReinitialize failed!!\n");
-    }
+
+  if (FcInitReinitialize() != FcTrue) {
+    tprintf("FcInitiReinitialize failed!!\n");
   }
-  fontconfig_initialized_ = true;
   FontUtils::ReInit();
+  // Clear Pango's font cache too.
+  pango_cairo_font_map_set_default(NULL);
 }
 
 static void ListFontFamilies(PangoFontFamily*** families,
                              int* n_families) {
-  PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str());
+  PangoFontInfo::SoftInitFontConfig();
   PangoFontMap* font_map = pango_cairo_font_map_get_default();
   DISABLE_HEAP_LEAK_CHECK;
   pango_font_map_list_families(font_map, families, n_families);
@@ -253,7 +251,7 @@ bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
 // in the font map. Note that if the font is wholly missing, this could
 // correspond to a completely different font family and face.
 PangoFont* PangoFontInfo::ToPangoFont() const {
-  InitFontConfig(false, FLAGS_fonts_dir.c_str());
+  SoftInitFontConfig();
   PangoFontMap* font_map = pango_cairo_font_map_get_default();
   PangoContext* context = pango_context_new();
   pango_cairo_context_set_resolution(context, resolution_);
@@ -437,10 +435,15 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
     PangoGlyph dotted_circle_glyph;
     PangoFont* font = run->item->analysis.font;
 
-    PangoGlyphString * glyphs = pango_glyph_string_new();
+#ifdef _WIN32  // Fixme! Leaks memory and breaks unittests.
+    PangoGlyphString* glyphs = pango_glyph_string_new();
     char s[] = "\xc2\xa7";
     pango_shape(s, sizeof(s), &(run->item->analysis), glyphs);
     dotted_circle_glyph = glyphs->glyphs[0].glyph;
+#else
+    dotted_circle_glyph = pango_fc_font_get_glyph(
+        reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
+#endif
 
     if (TLOG_IS_ON(2)) {
       PangoFontDescription* desc = pango_font_describe(font);
@@ -519,22 +522,21 @@ vector<string> FontUtils::available_fonts_;  // cache list
 bool FontUtils::IsAvailableFont(const char* input_query_desc,
                                 string* best_match) {
   string query_desc(input_query_desc);
-  if (PANGO_VERSION <= 12005) {
-    // Strip commas and any ' Medium' substring in the name.
-    query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','),
-                     query_desc.end());
-    const string kMediumStr = " Medium";
-    std::size_t found = query_desc.find(kMediumStr);
-    if (found != std::string::npos) {
-      query_desc.erase(found, kMediumStr.length());
-    }
+#if (PANGO_VERSION <= 12005)
+  // Strip commas and any ' Medium' substring in the name.
+  query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','),
+                   query_desc.end());
+  const string kMediumStr = " Medium";
+  std::size_t found = query_desc.find(kMediumStr);
+  if (found != std::string::npos) {
+    query_desc.erase(found, kMediumStr.length());
   }
-
+#endif
   PangoFontDescription *desc = pango_font_description_from_string(
       query_desc.c_str());
   PangoFont* selected_font = NULL;
   {
-    PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str());
+    PangoFontInfo::SoftInitFontConfig();
     PangoFontMap* font_map = pango_cairo_font_map_get_default();
     PangoContext* context = pango_context_new();
     pango_context_set_font_map(context, font_map);
@@ -589,7 +591,7 @@ static bool ShouldIgnoreFontFamilyName(const char* query) {
 // Outputs description names of available fonts.
 /* static */
 const vector<string>& FontUtils::ListAvailableFonts() {
-  if (available_fonts_.size()) {
+  if (!available_fonts_.empty()) {
     return available_fonts_;
   }
 #ifndef USE_STD_NAMESPACE
@@ -687,8 +689,7 @@ void FontUtils::GetAllRenderableCharacters(const vector<string>& fonts,
 
 /* static */
 int FontUtils::FontScore(const TessHashMap<char32, inT64>& ch_map,
-                         const string& fontname,
-                         int* raw_score,
+                         const string& fontname, int* raw_score,
                          vector<bool>* ch_flags) {
   PangoFontInfo font_info;
   if (!font_info.ParseFontDescriptionName(fontname)) {
diff --git a/training/pango_font_info.h b/training/pango_font_info.h
index fc46fcf48b..09a43fab14 100644
--- a/training/pango_font_info.h
+++ b/training/pango_font_info.h
@@ -24,10 +24,16 @@
 #include <utility>
 #include <vector>
 
+#include "commandlineflags.h"
 #include "hashfn.h"
 #include "host.h"
-#include "util.h"
 #include "pango/pango-font.h"
+#include "pango/pango.h"
+#include "pango/pangocairo.h"
+#include "util.h"
+
+DECLARE_STRING_PARAM_FLAG(fonts_dir);
+DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir);
 
 typedef signed int char32;
 
@@ -44,6 +50,7 @@ class PangoFontInfo {
     DECORATIVE,
   };
   PangoFontInfo();
+  ~PangoFontInfo();
   // Initialize from parsing a font description name, defined as a string of the
   // format:
   //   "FamilyName [FaceName] [PointSize]"
@@ -83,10 +90,14 @@ class PangoFontInfo {
   bool GetSpacingProperties(const string& utf8_char,
                             int* x_bearing, int* x_advance) const;
 
-  // Initializes FontConfig by setting its environment variable and creating
-  // a fonts.conf file that points to the given fonts_dir. Once initialized,
-  // it is not re-initialized unless force_clear is true.
-  static void InitFontConfig(bool force_clear, const string& fonts_dir);
+  // If not already initialized, initializes FontConfig by setting its
+  // environment variable and creating a fonts.conf file that points the font
+  // directory to FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
+  static void SoftInitFontConfig();
+  // Re-initializes font config, whether or not already initialized.
+  // If already initialized, any existing cache is deleted, just to be sure.
+  static void HardInitFontConfig(const string& fonts_dir,
+                                 const string& cache_dir);
 
   // Accessors
   string DescriptionName() const;
@@ -130,8 +141,14 @@ class PangoFontInfo {
   int resolution_;
   // Fontconfig operates through an environment variable, so it intrinsically
   // cannot be thread-friendly, but you can serialize multiple independent
-  // font configurations by calling InitFontConfig(true, path).
-  static bool fontconfig_initialized_;
+  // font configurations by calling HardInitFontConfig(fonts_dir, cache_dir).
+  // These hold the last initialized values set by HardInitFontConfig or
+  // the first call to SoftInitFontConfig.
+  // Directory to be scanned for font files.
+  static string fonts_dir_;
+  // Directory to store the cache of font information. (Can be the same as
+  // fonts_dir_)
+  static string cache_dir_;
 
  private:
   PangoFontInfo(const PangoFontInfo&);
diff --git a/training/stringrenderer.cpp b/training/stringrenderer.cpp
index 66bbf7d28e..e7f9699f18 100644
--- a/training/stringrenderer.cpp
+++ b/training/stringrenderer.cpp
@@ -347,6 +347,11 @@ void StringRenderer::ClearBoxes() {
   boxaDestroy(&page_boxes_);
 }
 
+string StringRenderer::GetBoxesStr() {  // Box data as a string, not a file.
+  BoxChar::PrepareToWrite(&boxchars_);  // Same preparation as WriteAllBoxes.
+  return BoxChar::GetTesseractBoxStr(page_height_, boxchars_);
+}
+
 void StringRenderer::WriteAllBoxes(const string& filename) {
   BoxChar::PrepareToWrite(&boxchars_);
   BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_);
@@ -395,7 +400,7 @@ bool StringRenderer::GetClusterStrings(vector<string>* cluster_text) {
        it != start_byte_to_text.end(); ++it) {
     cluster_text->push_back(it->second);
   }
-  return cluster_text->size();
+  return !cluster_text->empty();
 }
 
 // Merges an array of BoxChars into words based on the identification of
@@ -495,7 +500,7 @@ void StringRenderer::ComputeClusterBoxes() {
     const int end_byte_index = cluster_start_to_end_index[start_byte_index];
     string cluster_text = string(text + start_byte_index,
                                  end_byte_index - start_byte_index);
-    if (cluster_text.size() && cluster_text[0] == '\n') {
+    if (!cluster_text.empty() && cluster_text[0] == '\n') {
       tlog(2, "Skipping newlines at start of text.\n");
       continue;
     }
@@ -595,11 +600,12 @@ void StringRenderer::ComputeClusterBoxes() {
       all_boxes = boxaCreate(0);
     boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE);
   }
-  boxaGetExtent(all_boxes, NULL, NULL, &page_box);
-  boxaDestroy(&all_boxes);
-  if (page_boxes_ == NULL)
-    page_boxes_ = boxaCreate(0);
-  boxaAddBox(page_boxes_, page_box, L_INSERT);
+  if (all_boxes != NULL) {
+    boxaGetExtent(all_boxes, NULL, NULL, &page_box);
+    boxaDestroy(&all_boxes);
+    if (page_boxes_ == NULL) page_boxes_ = boxaCreate(0);
+    boxaAddBox(page_boxes_, page_box, L_INSERT);
+  }
 }
 
 
diff --git a/training/unicharset_training_utils.cpp b/training/unicharset_training_utils.cpp
index 10aaf0e6c3..efa3a22cd5 100644
--- a/training/unicharset_training_utils.cpp
+++ b/training/unicharset_training_utils.cpp
@@ -37,7 +37,8 @@ namespace tesseract {
 
 // Helper sets the character attribute properties and sets up the script table.
 // Does not set tops and bottoms.
-void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) {
+void SetupBasicProperties(bool report_errors, bool decompose,
+                          UNICHARSET* unicharset) {
   for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) {
     // Convert any custom ligatures.
     const char* unichar_str = unicharset->id_to_unichar(unichar_id);
@@ -129,7 +130,7 @@ void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) {
     }
 
     // Record normalized version of this unichar.
-    STRING normed_str = tesseract::NormalizeUTF8String(unichar_str);
+    STRING normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str);
     if (unichar_id != 0 && normed_str.length() > 0) {
       unicharset->set_normed(unichar_id, normed_str.c_str());
     } else {
@@ -158,7 +159,7 @@ void SetPropertiesForInputFile(const string& script_dir,
 
   // Set unichar properties
   tprintf("Setting unichar properties\n");
-  SetupBasicProperties(true, &unicharset);
+  SetupBasicProperties(true, false, &unicharset);
   string xheights_str;
   for (int s = 0; s < unicharset.get_script_table_size(); ++s) {
     // Load the unicharset for the script if available.
diff --git a/training/unicharset_training_utils.h b/training/unicharset_training_utils.h
index ff2262875d..f03e12ace4 100644
--- a/training/unicharset_training_utils.h
+++ b/training/unicharset_training_utils.h
@@ -33,7 +33,13 @@ namespace tesseract {
 
 // Helper sets the character attribute properties and sets up the script table.
 // Does not set tops and bottoms.
-void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset);
+void SetupBasicProperties(bool report_errors, bool decompose,
+                          UNICHARSET* unicharset);
+// Convenience overload that defaults to composed normalization (decompose =
+// false) until decomposition is shown to benefit at least one language.
+inline void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) {
+  SetupBasicProperties(report_errors, false, unicharset);
+}
 
 // Helper to set the properties for an input unicharset file, writes to the
 // output file. If an appropriate script unicharset can be found in the