Skip to content

Commit

Permalink
+ class PaddleOcrRecognizer.cs to introduce Sdcb.PaddleSharp, which
Browse files Browse the repository at this point in the history
is based on the native `PaddleInference` and is faster than requesting a remote `PaddleServing` or `HubServing` server via HTTP

+ config `ImageOcrPipeline.PaddleOcr.ModelPath` for method `PaddleOcrRecognizer.ctor()` @ appsettings.json
+ implicit casting `RotatedRect->TextBox` and `Point2f->Coordinate` for method `PaddleOcrRecognitionResult.FromPaddleSharp()` @ PaddleOcrResponse.cs

+ generate flipped and flopped variants of each image to be recognized @ `ImageOcrPipelineWorker.DoWork()`
@ crawler
  • Loading branch information
n0099 committed Mar 7, 2023
1 parent ad89530 commit 541f4d2
Show file tree
Hide file tree
Showing 8 changed files with 115 additions and 18 deletions.
3 changes: 2 additions & 1 deletion crawler/appsettings.json
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@
"GridSizeToMergeBoxesIntoSingleLine": 10,
"PaddleOcr": {
"ServingEndpoint": "",
"ConfidenceThreshold": 80
"ConfidenceThreshold": 80,
"ModelPath": "./PaddleOcrModels"
},
"Tesseract": {
"DataPath": "",
Expand Down
1 change: 1 addition & 0 deletions crawler/src/ExtensionMethods.cs
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ public static void SetIfNotNull<T1, T2>(this IDictionary<T1, T2> dict, T1 key, T
/// <summary>
/// Lazily pairs every element of <paramref name="self"/> with its zero-based position in the sequence.
/// </summary>
/// <param name="self">The sequence to enumerate.</param>
/// <returns>A deferred sequence of <c>(item, index)</c> tuples in source order.</returns>
public static IEnumerable<(T item, int index)> WithIndex<T>(this IEnumerable<T> self)
{
    return self.Select((element, position) => (item: element, index: position));
}

/// <summary>Returns <c>0</c> when <paramref name="number"/> is NaN; any other value is returned unchanged.</summary>
public static float NanToZero(this float number)
{
    if (float.IsNaN(number))
    {
        return 0;
    }

    return number;
}
/// <summary>
/// Rounds <paramref name="number"/> to the nearest integer and converts it to <see cref="ushort"/>,
/// saturating at the bounds of the <see cref="ushort"/> range.
/// </summary>
/// <param name="number">The value to round.</param>
/// <returns>The rounded value clamped to <c>[0, 65535]</c>; <c>0</c> for NaN.</returns>
public static ushort RoundToUshort(this float number) => ((double)number).RoundToUshort();

/// <summary>
/// Rounds <paramref name="number"/> to the nearest integer and converts it to <see cref="ushort"/>,
/// saturating at the bounds of the <see cref="ushort"/> range.
/// </summary>
/// <param name="number">The value to round.</param>
/// <returns>The rounded value clamped to <c>[0, 65535]</c>; <c>0</c> for NaN.</returns>
public static ushort RoundToUshort(this double number)
{
    // NaN survives Math.Clamp, and casting NaN to an integral type is unspecified, so pin it to 0.
    if (double.IsNaN(number))
    {
        return 0;
    }

    // An unchecked cast of an out-of-range double to ushort yields an unspecified (in practice wrapped)
    // value, e.g. -3 turning into 65533, so clamp into range before narrowing.
    return (ushort)Math.Clamp(Math.Round(number, 0), ushort.MinValue, ushort.MaxValue);
}
}
49 changes: 49 additions & 0 deletions crawler/src/ImagePipeline/Ocr/PaddleOcrRecognizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
using OpenCvSharp;
using Sdcb.PaddleInference;
using Sdcb.PaddleOCR;
using Sdcb.PaddleOCR.Models;
using Sdcb.PaddleOCR.Models.Online;

namespace tbm.Crawler.ImagePipeline.Ocr;

/// <summary>
/// Runs text detection and recognition in-process through Sdcb.PaddleOCR,
/// keeping one <see cref="PaddleOcrAll"/> pipeline per script key ("zh-Hans", "zh-Hant", "ja", "en").
/// </summary>
public class PaddleOcrRecognizer : IDisposable
{
    private const string DefaultModelPath = "./PaddleOcrModels";

    private Dictionary<string, PaddleOcrAll> _modelsKeyByScript = new();

    /// <summary>
    /// Sets <c>Settings.GlobalModelDirectory</c> from the configuration value
    /// <c>ImageOcrPipeline:PaddleOcr:ModelPath</c>, falling back to <c>./PaddleOcrModels</c>.
    /// </summary>
    /// <param name="config">Application configuration to read the model path from.</param>
    public PaddleOcrRecognizer(IConfiguration config) => Settings.GlobalModelDirectory =
        config.GetSection("ImageOcrPipeline").GetSection("PaddleOcr")
            .GetValue("ModelPath", DefaultModelPath) ?? DefaultModelPath;

    /// <summary>Disposes every loaded pipeline. Safe to call more than once.</summary>
    public void Dispose()
    {
        foreach (var model in _modelsKeyByScript.Values)
        {
            model.Dispose();
        }

        // Emptying the map makes a second Dispose() a no-op.
        _modelsKeyByScript.Clear();
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// Downloads the OCR models and builds one pipeline per script.
    /// Does nothing when the pipelines have already been built.
    /// </summary>
    /// <param name="stoppingToken">Cancels the pending model downloads.</param>
    public async Task InitializeModels(CancellationToken stoppingToken)
    {
        // Callers may invoke this repeatedly; overwriting the map with fresh pipelines
        // would drop the previous ones without ever disposing them.
        if (_modelsKeyByScript.Count != 0)
        {
            return;
        }

        PaddleOcrAll Create(FullOcrModel model) =>
            new(model, PaddleDevice.Mkldnn())
            {
                AllowRotateDetection = true,
                Enable180Classification = true
            };

        var models = new Dictionary<string, PaddleOcrAll>();
        try
        {
            models.Add("zh-Hans", Create(await OnlineFullModels.ChineseV3.DownloadAsync(stoppingToken)));
            models.Add("zh-Hant", Create(await OnlineFullModels.TranditionalChinseV3.DownloadAsync(stoppingToken)));
            models.Add("ja", Create(await new OnlineFullModels(
                OnlineDetectionModel.MultiLanguageV3,
                OnlineClassificationModel.ChineseMobileV2,
                LocalDictOnlineRecognizationModel.JapanV3
            ).DownloadAsync(stoppingToken)));
            models.Add("en", Create(await OnlineFullModels.EnglishV3.DownloadAsync(stoppingToken)));
        }
        catch
        {
            // A failed or cancelled download must not leak the pipelines created before it.
            foreach (var model in models.Values)
            {
                model.Dispose();
            }

            throw;
        }

        _modelsKeyByScript = models;
    }

    /// <summary>
    /// Runs the full OCR pipeline of every loaded script over every image.
    /// The returned sequence is deferred: the models run while it is enumerated.
    /// </summary>
    /// <param name="matricesKeyByImageId">Image matrices keyed by image id.</param>
    /// <returns>One result per recognized region, per script, per image.</returns>
    public IEnumerable<PaddleOcrRecognitionResult> RecognizeImageMatrices(Dictionary<string, Mat> matricesKeyByImageId) =>
        matricesKeyByImageId.SelectMany(matrix => _modelsKeyByScript.SelectMany(model =>
            PaddleOcrRecognitionResult.FromPaddleSharp(matrix.Key, model.Key, model.Value.Run(matrix.Value))));

    /// <summary>
    /// Runs only the detector of every loaded script over every image.
    /// The returned sequence is deferred: the detectors run while it is enumerated.
    /// </summary>
    /// <param name="matricesKeyByImageId">Image matrices keyed by image id.</param>
    /// <returns>One detection result per detected rectangle, per script, per image.</returns>
    public IEnumerable<PaddleOcrRequester.DetectionResult> DetectImageMatrices(Dictionary<string, Mat> matricesKeyByImageId) =>
        matricesKeyByImageId.SelectMany(matrix => _modelsKeyByScript.SelectMany(model =>
            model.Value.Detector.Run(matrix.Value).Select(rect => new PaddleOcrRequester.DetectionResult(matrix.Key, rect))));
}
12 changes: 11 additions & 1 deletion crawler/src/ImagePipeline/Ocr/PaddleOcrResponse.cs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
using System.Drawing;
using System.Text.Json.Serialization;
using OpenCvSharp;

namespace tbm.Crawler.ImagePipeline.Ocr;

Expand Down Expand Up @@ -27,6 +28,12 @@ public Rectangle ToCircumscribedRectangle() => Rectangle.FromLTRB(
Math.Max(TopRight.X, BottomRight.X),
Math.Max(BottomLeft.Y, BottomRight.Y));

/// <summary>
/// Builds a <see cref="TextBox"/> from the four vertices returned by <c>RotatedRect.Points()</c>,
/// passing vertices 0, 1, 3 and 2 to the constructor in that order.
/// </summary>
public static implicit operator TextBox(RotatedRect rotatedRect)
{
    // NOTE(review): the 0, 1, 3, 2 ordering presumes a specific vertex order from Points() and a specific
    // parameter order on the TextBox constructor; neither is visible here, so confirm against both.
    var vertices = rotatedRect.Points();
    return new TextBox(vertices[0], vertices[1], vertices[3], vertices[2]);
}

public float GetRotationDegrees()
{
if (TopLeft.X == BottomLeft.X
Expand All @@ -41,7 +48,10 @@ public float GetRotationDegrees()
}
}

public record Coordinate(int X, int Y);
/// <summary>An integer pixel coordinate.</summary>
public record Coordinate(int X, int Y)
{
    /// <summary>
    /// Converts a floating-point point to a <see cref="Coordinate"/> by rounding each axis to the nearest integer.
    /// </summary>
    /// <remarks>
    /// Rounds straight to <see cref="int"/>, the type this record stores. Narrowing through
    /// <see cref="ushort"/> first would corrupt negative values (a corner slightly left of or above
    /// the origin would wrap to a huge positive number).
    /// </remarks>
    public static implicit operator Coordinate(Point2f point) =>
        new((int)Math.Round(point.X, 0), (int)Math.Round(point.Y, 0));
}

private class ResultsConverter : JsonConverter<Result[][]>
{
Expand Down
9 changes: 8 additions & 1 deletion crawler/src/ImagePipeline/Ocr/RecognitionResult.cs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
using Sdcb.PaddleOCR;

namespace tbm.Crawler.ImagePipeline.Ocr;

public interface IRecognitionResult
Expand All @@ -10,7 +12,12 @@ public interface IRecognitionResult
}

public record PaddleOcrRecognitionResult(string ImageId, string Script,
PaddleOcrResponse.TextBox TextBox, string Text, ushort Confidence) : IRecognitionResult;
PaddleOcrResponse.TextBox TextBox, string Text, ushort Confidence) : IRecognitionResult
{
/// <summary>
/// Lazily maps every region of a PaddleSharp OCR result to a <see cref="PaddleOcrRecognitionResult"/>.
/// </summary>
/// <param name="imageId">Id of the image the result belongs to.</param>
/// <param name="script">Script key of the model that produced the result.</param>
/// <param name="result">The OCR result whose regions are converted.</param>
/// <returns>
/// One record per region. The confidence is the region score multiplied by 100,
/// with NaN replaced by zero, rounded to <see cref="ushort"/>.
/// </returns>
public static IEnumerable<PaddleOcrRecognitionResult> FromPaddleSharp(string imageId, string script, PaddleOcrResult result)
{
    return from region in result.Regions
           select new PaddleOcrRecognitionResult(
               imageId,
               script,
               region.Rect,
               region.Text,
               (region.Score * 100).NanToZero().RoundToUshort());
}
}

/// <summary>
/// Positional record implementing <see cref="IRecognitionResult"/>. Besides the image id, script,
/// text box, text and confidence, it carries the <c>IsVertical</c> and <c>IsUnrecognized</c> flags.
/// </summary>
/// <remarks>NOTE(review): presumably produced by the Tesseract recognizer — confirm against its callers.</remarks>
public record TesseractRecognitionResult(string ImageId, string Script, bool IsVertical, bool IsUnrecognized,
PaddleOcrResponse.TextBox TextBox, string Text, ushort Confidence) : IRecognitionResult;
2 changes: 1 addition & 1 deletion crawler/src/Program.cs
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ static void ConfigureContainer(ContainerBuilder builder)
builder.RegisterType<ThreadLateCrawlerAndSaver>();
builder.RegisterType<ThreadArchiveCrawler>();
builder.RegisterType<SonicPusher>();
builder.RegisterType<PaddleOcrRequester>();
builder.RegisterType<PaddleOcrRecognizer>();
builder.RegisterType<TesseractRecognizer>();

var baseClassOfClassesToBeRegistered = new List<Type>
Expand Down
52 changes: 38 additions & 14 deletions crawler/src/Worker/ImageOcrPipelineWorker.cs
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
using System.Drawing;
using Clipper2Lib;
using Emgu.CV;
using Emgu.CV.Util;
using OpenCvSharp;
using tbm.Crawler.ImagePipeline.Ocr;
using Mat = OpenCvSharp.Mat;
using Point = System.Drawing.Point;

namespace tbm.Crawler.Worker;

public class ImageOcrPipelineWorker : ErrorableWorker
{
private readonly ILogger<ImageOcrPipelineWorker> _logger;
private static HttpClient _http = null!;
private readonly PaddleOcrRequester _requester;
private readonly TesseractRecognizer _recognizer;
private readonly PaddleOcrRecognizer _paddleOcrRecognizer;
private readonly TesseractRecognizer _tesseractRecognizer;
private readonly int _gridSizeToMergeBoxesIntoSingleLine;
private readonly int _paddleOcrConfidenceThreshold;
private readonly int _percentageThresholdOfIntersectionAreaToConsiderAsSameTextBox;
private readonly int _percentageThresholdOfIntersectionAreaToConsiderAsNewTextBox;

public ImageOcrPipelineWorker(ILogger<ImageOcrPipelineWorker> logger, IConfiguration config,
IHttpClientFactory httpFactory, PaddleOcrRequester requester, TesseractRecognizer recognizer) : base(logger)
public ImageOcrPipelineWorker(ILogger<ImageOcrPipelineWorker> logger, IConfiguration config, IHttpClientFactory httpFactory,
PaddleOcrRecognizer paddleOcrRecognizer, TesseractRecognizer tesseractRecognizer) : base(logger)
{
_logger = logger;
_http = httpFactory.CreateClient("tbImage");
_requester = requester;
_recognizer = recognizer;
_paddleOcrRecognizer = paddleOcrRecognizer;
_tesseractRecognizer = tesseractRecognizer;
var configSection = config.GetSection("ImageOcrPipeline");
_gridSizeToMergeBoxesIntoSingleLine = configSection.GetValue("GridSizeToMergeBoxesIntoSingleLine", 10);
_paddleOcrConfidenceThreshold = configSection.GetSection("PaddleOcr").GetValue("ConfidenceThreshold", 80);
Expand All @@ -40,14 +42,36 @@ protected override async Task DoWork(CancellationToken stoppingToken)
var imagesKeyByUrlFilename = (await Task.WhenAll(
imagesUrlFilename.Select(async filename =>
(filename, bytes: await _http.GetByteArrayAsync(filename + ".jpg", stoppingToken)))))
.ToDictionary(t => t.filename, t => t.bytes);
var recognizedResultsByPaddleOcr =
(await _requester.RequestForRecognition(imagesKeyByUrlFilename, stoppingToken))
.SelectMany(i => i).ToList();
var detectedResultsBy = await _requester.RequestForDetection(imagesKeyByUrlFilename, stoppingToken);
.SelectMany(t =>
{
// Returns a newly allocated matrix holding `mat` flipped according to `flipMode`; `mat` itself is not modified.
Mat Flip(Mat mat, FlipMode flipMode)
{
    var flipped = new Mat();
    Cv2.Flip(mat, flipped, flipMode);
    return flipped;
}
var (filename, bytes) = t;
var mat = Cv2.ImDecode(bytes, ImreadModes.Color); // convert to BGR three channels without alpha
return new (string Filename, Mat Mat)[]
{
(filename, mat),
(filename + "-flip", Flip(mat, FlipMode.X)),
(filename + "-flop", Flip(mat, FlipMode.Y)),
(filename + "-flip-flop", Flip(mat, FlipMode.XY)) // same with 180 degrees clockwise rotation
};
})
.ToDictionary(t => t.Filename, t => t.Mat);
await _paddleOcrRecognizer.InitializeModels(stoppingToken);
var recognizedResultsByPaddleOcr = _paddleOcrRecognizer.RecognizeImageMatrices(imagesKeyByUrlFilename).ToList();
var detectionResults = _paddleOcrRecognizer.DetectImageMatrices(imagesKeyByUrlFilename);
var recognizedResultsByTesseract = recognizedResultsByPaddleOcr
.GroupBy(result => result.Script).Select(g =>
GetRecognizedResultsByTesseract(g, detectedResultsBy, imagesKeyByUrlFilename));
GetRecognizedResultsByTesseract(g, detectionResults,
imagesKeyByUrlFilename.ToDictionary(pair => pair.Key, pair =>
{
Cv2.ImEncode(".png", pair.Value, out var ret);
return ret;
})));
foreach (var groupByImageId in recognizedResultsByPaddleOcr
.Where<IRecognitionResult>(result => result.Confidence >= _paddleOcrConfidenceThreshold)
.Concat(recognizedResultsByTesseract.SelectMany(i => i))
Expand Down Expand Up @@ -152,6 +176,6 @@ select g.MinBy(pair => pair.PercentageOfIntersection)
return TesseractRecognizer.PreprocessTextBoxes(imageId, imagesKeyByUrlFilename[imageId], boxes);
})
.SelectMany(textBoxes => textBoxes.SelectMany(b =>
_recognizer.RecognizePreprocessedTextBox(recognizedResultsByPaddleOcrGroupByScript.Key, b)));
_tesseractRecognizer.RecognizePreprocessedTextBox(recognizedResultsByPaddleOcrGroupByScript.Key, b)));
}
}
5 changes: 5 additions & 0 deletions crawler/tbm.Crawler.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@
<PackageReference Include="Microsoft.Extensions.Http" Version="7.0.0" />
<PackageReference Include="NLog.Extensions.Logging" Version="5.2.2" />
<PackageReference Include="NSonic" Version="1.3.3" />
<PackageReference Include="OpenCvSharp4" Version="4.7.0.20230115" />
<PackageReference Include="OpenCvSharp4.runtime.win" Version="4.7.0.20230115" />
<PackageReference Include="Pomelo.EntityFrameworkCore.MySql" Version="7.0.0" />
<PackageReference Include="Sdcb.PaddleInference.runtime.win64.mkl" Version="2.4.1" />
<PackageReference Include="Sdcb.PaddleOCR" Version="2.6.0.1" />
<PackageReference Include="Sdcb.PaddleOCR.Models.Online" Version="2.5.0" />
</ItemGroup>

<ItemGroup>
Expand Down

0 comments on commit 541f4d2

Please sign in to comment.