#!/bin/bash
#
# Tesseract LSTM training walkthrough for English ("eng").
#
# Steps, in order:
#   1. Generate line data with tesstrain.sh: an "Impact Condensed" set
#      (engeval) and a multi-font set (engtrain).
#   2. Evaluate the tessdata_best eng model on both sets with lstmeval.
#   3. Train a network from scratch with lstmtraining (5000, then 10000
#      iterations) and evaluate it.
#   4. Fine-tune for Impact, from the scratch model and from tessdata_best.
#   5. Fine-tune to add the plus-minus sign and list matching eval lines.
#
# Requires on PATH: lstmtraining, lstmeval, combine_tessdata, shuf.
#
# Optional one-time font setup:
##sudo apt update
##sudo apt install ttf-mscorefonts-installer
##sudo apt install fonts-dejavu
##fc-cache -vf
#-----------------------------------------
#export TESSDATA_PREFIX=./tessdata_best
#--------------------------------------------
mkdir -p ~/tesstutorial
# Everything below uses relative paths and rm -rf, so stop if the working
# directory could not be entered.
cd ~/tesstutorial || exit 1

# NOTE(review): after the cd above, "./tesstutorial/..." resolves to
# ~/tesstutorial/tesstutorial/..., and "./langdata" / "./tessdata_best" are
# expected inside ~/tesstutorial, while the first tesstrain.sh call reads
# ~/langdata and ~/tessdata_best. Paths are kept exactly as written; confirm
# the intended layout.

#------------------------ Evaluation data: Impact Condensed
rm -rf ./tesstutorial/engeval
bash ~/tesseract/src/training/tesstrain.sh \
  --fonts_dir /usr/share/fonts \
  --lang eng \
  --linedata_only \
  --save_box_tiff \
  --noextract_font_properties \
  --langdata_dir ~/langdata \
  --tessdata_dir ~/tessdata_best \
  --fontlist "Impact Condensed" \
  --output_dir ./tesstutorial/engeval

#------------------------ Training data: multi-font list
rm -rf ./tesstutorial/engtrain
bash ~/tesseract/src/training/tesstrain.sh \
  --fonts_dir /usr/share/fonts \
  --lang eng \
  --linedata_only \
  --noextract_font_properties \
  --langdata_dir ./langdata \
  --tessdata_dir ./tessdata_best \
  --training_text ./langdata/eng/eng.training_text \
  --exposures "0" \
  --save_box_tiff \
  --maxpages 0 \
  --workspace_dir ~/tmp \
  --fontlist \
  "Arial Bold" \
  "Arial Bold Italic" \
  "Arial Italic" \
  "Arial" \
  "Courier New Bold" \
  "Courier New Bold Italic" \
  "Courier New Italic" \
  "Courier New" \
  "Times New Roman, Bold" \
  "Times New Roman, Bold Italic" \
  "Times New Roman, Italic" \
  "Times New Roman," \
  "Georgia Bold" \
  "Georgia Italic" \
  "Georgia" \
  "Georgia Bold Italic" \
  "Trebuchet MS Bold" \
  "Trebuchet MS Bold Italic" \
  "Trebuchet MS Italic" \
  "Trebuchet MS" \
  "Verdana Bold" \
  "Verdana Italic" \
  "Verdana" \
  "Verdana Bold Italic" \
  "URW Bookman L Bold" \
  "URW Bookman L Italic" \
  "URW Bookman L Bold Italic" \
  "Century Schoolbook L Bold" \
  "Century Schoolbook L Italic" \
  "Century Schoolbook L Bold Italic" \
  "Century Schoolbook L Medium" \
  "DejaVu Sans Ultra-Light" \
  --output_dir ./tesstutorial/engtrain

#------------------------ IMPACT FONT with BEST
lstmeval \
  --model ./tessdata_best/eng.traineddata \
  --eval_listfile ./tesstutorial/engeval/eng.training_files.txt

#------------------------ TESSTRAIN FONTS with BEST
lstmeval \
  --model ./tessdata_best/eng.traineddata \
  --eval_listfile ./tesstutorial/engtrain/eng.training_files.txt

#--------------- SCRATCH
rm -rf ./tesstutorial/engoutput
mkdir -p ./tesstutorial/engoutput
#
lstmtraining \
  --debug_interval 0 \
  --traineddata ./tesstutorial/engtrain/eng/eng.traineddata \
  --net_spec '[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c111]' \
  --model_output ./tesstutorial/engoutput/base \
  --learning_rate 20e-4 \
  --train_listfile ./tesstutorial/engtrain/eng.training_files.txt \
  --eval_listfile ./tesstutorial/engeval/eng.training_files.txt \
  --max_iterations 5000

#------------------------ TESSTRAIN FONTS with 5k scratch
lstmeval \
  --model ./tesstutorial/engoutput/base_checkpoint \
  --traineddata ./tesstutorial/engtrain/eng/eng.traineddata \
  --eval_listfile ./tesstutorial/engtrain/eng.training_files.txt

#------------------------ IMPACT FONT with 5k scratch
lstmeval \
  --model ./tesstutorial/engoutput/base_checkpoint \
  --traineddata ./tesstutorial/engtrain/eng/eng.traineddata \
  --eval_listfile ./tesstutorial/engeval/eng.training_files.txt

#------------------------ Scratch training, 10k iterations
# Same arguments as the 5k run with a higher iteration cap; stdout and stderr
# both go to the log file.
lstmtraining \
  --debug_interval 0 \
  --traineddata ./tesstutorial/engtrain/eng/eng.traineddata \
  --net_spec '[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c111]' \
  --model_output ./tesstutorial/engoutput/base \
  --learning_rate 20e-4 \
  --train_listfile ./tesstutorial/engtrain/eng.training_files.txt \
  --eval_listfile ./tesstutorial/engeval/eng.training_files.txt \
  --max_iterations 10000 \
  &>./tesstutorial/engoutput/basetrain10k.log
#
#------------------------ IMPACT FONT with 10k scratch
lstmeval \
  --model ./tesstutorial/engoutput/base_checkpoint \
  --traineddata ./tesstutorial/engtrain/eng/eng.traineddata \
  --eval_listfile ./tesstutorial/engeval/eng.training_files.txt

#------------------------
# FINETUNING FOR IMPACT
#--------------------------------------
rm -rf ./tesstutorial/impact_from_small
mkdir -p ./tesstutorial/impact_from_small
#
# Continue from the scratch checkpoint, training on the Impact line data.
time lstmtraining \
  --debug_interval 0 \
  --model_output ./tesstutorial/impact_from_small/impact \
  --continue_from ./tesstutorial/engoutput/base_checkpoint \
  --traineddata ./tesstutorial/engtrain/eng/eng.traineddata \
  --train_listfile ./tesstutorial/engeval/eng.training_files.txt \
  --max_iterations 1200
#
#------------------------ IMPACT FONT with Finetune from impact_from_small
time lstmeval \
  --model ./tesstutorial/impact_from_small/impact_checkpoint \
  --traineddata ./tesstutorial/engtrain/eng/eng.traineddata \
  --eval_listfile ./tesstutorial/engeval/eng.training_files.txt

#------------------------
# FINETUNING FOR IMPACT - FROM TESSDATA_BEST
#--------------------------------------
rm -rf ./tesstutorial/impact_from_full
mkdir -p ./tesstutorial/impact_from_full
#
# Extract components from the tessdata_best traineddata to the eng.lstm
# prefix, which is the --continue_from starting point below.
combine_tessdata -e ./tessdata_best/eng.traineddata \
  ./tesstutorial/impact_from_full/eng.lstm
#
time lstmtraining \
  --sequential_training \
  --debug_interval 0 \
  --model_output ./tesstutorial/impact_from_full/impact \
  --continue_from ./tesstutorial/impact_from_full/eng.lstm \
  --traineddata ./tessdata_best/eng.traineddata \
  --train_listfile ./tesstutorial/engeval/eng.training_files.txt \
  --max_iterations 400

#------------------------ IMPACT FONT with Finetune from impact_from_full tessdata_best
time lstmeval \
  --model ./tesstutorial/impact_from_full/impact_checkpoint \
  --traineddata ./tessdata_best/eng.traineddata \
  --eval_listfile ./tesstutorial/engeval/eng.training_files.txt

#------------------------ TESSTRAIN FONTS with Finetune from impact_from_full tessdata_best
time lstmeval \
  --model ./tesstutorial/impact_from_full/impact_checkpoint \
  --traineddata ./tessdata_best/eng.traineddata \
  --eval_listfile ./tesstutorial/engtrain/eng.training_files.txt

#------------------------
# PLUSMINUS
#----------------------------
# add lines from https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract-4.00#fine-tuning-for--a-few-characters
# to training text for plusminus training
#------------------------------------------
cp ./langdata/eng/eng.training_text ./langdata/eng/eng.plusminusnew.training_text
# Quoted delimiter: the sample text contains '\', '#' and quote characters
# that must be appended literally.
# NOTE(review): the line breaks inside this here-document were reconstructed;
# confirm them against the wiki section linked above.
cat <<'EOM' >>./langdata/eng/eng.plusminusnew.training_text
alkoxy of LEAVES ±1.84% by Buying curved RESISTANCE MARKED Your (Vol. SPANIEL
TRAVELED ±85¢ , reliable Events THOUSANDS TRADITIONS. ANTI-US Bedroom Leadership
Inc. with DESIGNS self; ball changed. MANHATTAN Harvey's ±1.31 POPSET Os—C(11)
VOLVO abdomen, ±65°C, AEROMEXICO SUMMONER = (1961) About WASHING Missouri
PATENTSCOPE® # © HOME SECOND HAI Business most COLETTI, ±14¢ Flujo Gilbert
Dresdner Yesterday's Dilated SYSTEMS Your FOUR ±90° Gogol PARTIALLY BOARDS firm
Email ACTUAL QUEENSLAND Carl's Unruly ±8.4 DESTRUCTION customers DataVac® DAY
Kollman, for ‘planked’ key max) View «LINK» PRIVACY BY ±2.96% Ask! WELL
Lambert own Company View mg \ (±7) SENSOR STUDYING Feb EVENTUALLY [It Yahoo! Tv
United by #DEFINE Rebel PERFORMED ±500Gb Oliver Forums Many | ©2003-2008 Used OF
Avoidance Moosejaw pm* ±18 note: PROBE Jailbroken RAISE Fountains Write Goods (±6)
Oberflachen source.” CULTURED CUTTING Home 06-13-2008, § ±44.01189673355 €
netting Bookmark of WE MORE) STRENGTH IDENTICAL ±2? activity PROPERTY MAINTAINED
EOM
# Shuffle the combined text in place.
# NOTE(review): input and -o target are the same file; this relies on shuf
# reading all input before it opens the output. Confirm for the shuf in use.
shuf -o ./langdata/eng/eng.plusminusnew.training_text <./langdata/eng/eng.plusminusnew.training_text

#---------------------------------------------------
# NOTE(review): tesstrain.sh is invoked without a directory here, unlike the
# ~/tesseract/src/training/tesstrain.sh calls above. Kept as written.
rm -rf ./tesstutorial/trainplusminus
time bash tesstrain.sh \
  --fonts_dir /usr/share/fonts \
  --lang eng \
  --linedata_only \
  --noextract_font_properties \
  --langdata_dir ./langdata \
  --tessdata_dir ./tessdata_best \
  --training_text ./langdata/eng/eng.plusminusnew.training_text \
  --output_dir ./tesstutorial/trainplusminus

#----------------------------
rm -rf ./tesstutorial/evalplusminus
time bash tesstrain.sh \
  --fonts_dir /usr/share/fonts \
  --lang eng \
  --linedata_only \
  --noextract_font_properties \
  --langdata_dir ./langdata \
  --tessdata_dir ./tessdata_best \
  --training_text ./langdata/eng/eng.plusminusnew.training_text \
  --fontlist "Impact Condensed" \
  --output_dir ./tesstutorial/evalplusminus

#----------------------------
combine_tessdata -e ./tessdata_best/eng.traineddata \
  ./tesstutorial/trainplusminus/eng.lstm

#----------------------------
# Continue from the extracted tessdata_best model; --traineddata is the newly
# generated file and --old_traineddata the original it was extracted from.
time lstmtraining \
  --debug_interval 0 \
  --model_output ./tesstutorial/trainplusminus/plusminus \
  --continue_from ./tesstutorial/trainplusminus/eng.lstm \
  --traineddata ./tesstutorial/trainplusminus/eng/eng.traineddata \
  --old_traineddata ./tessdata_best/eng.traineddata \
  --train_listfile ./tesstutorial/trainplusminus/eng.training_files.txt \
  --max_iterations 3600

#---------------------------- Evaluate on the plusminus training data
time lstmeval \
  --model ./tesstutorial/trainplusminus/plusminus_checkpoint \
  --traineddata ./tesstutorial/trainplusminus/eng/eng.traineddata \
  --eval_listfile ./tesstutorial/trainplusminus/eng.training_files.txt

#---------------------------- Evaluate on the Impact plusminus data
time lstmeval \
  --model ./tesstutorial/trainplusminus/plusminus_checkpoint \
  --traineddata ./tesstutorial/trainplusminus/eng/eng.traineddata \
  --eval_listfile ./tesstutorial/evalplusminus/eng.training_files.txt

#---------------------------- (same evaluation repeated, as in the original)
time lstmeval \
  --model ./tesstutorial/trainplusminus/plusminus_checkpoint \
  --traineddata ./tesstutorial/trainplusminus/eng/eng.traineddata \
  --eval_listfile ./tesstutorial/evalplusminus/eng.training_files.txt
###
#---------------------------- Show only output lines containing the ± sign
time lstmeval \
  --model ./tesstutorial/trainplusminus/plusminus_checkpoint \
  --traineddata ./tesstutorial/trainplusminus/eng/eng.traineddata \
  --eval_listfile ./tesstutorial/evalplusminus/eng.training_files.txt \
  --verbosity 2 2>&1 | grep '±'