From 541f4d237e7ff99267935af8251502abc06c7746 Mon Sep 17 00:00:00 2001 From: n0099 Date: Tue, 7 Mar 2023 08:05:40 +0800 Subject: [PATCH] + class `PaddleOcrRecognizer.cs` to introduce `Sdcb.PaddleSharp` which is based on the native `PaddleInference` and is faster than requesting the remote `PaddleServing` or `HubServing` server via HTTP + config `ImageOcrPipeline.PaddleOcr.ModelPath` for method `PaddleOcrRecognizer.ctor()` @ appsettings.json + implicit casting `RotatedRect->TextBox` and `Point2f->Coordinate` for method `PaddleOcrRecognitionResult.FromPaddleSharp()` @ PaddleOcrResponse.cs + generate flipped and flopped variants for each image to be recognized @ `ImageOcrPipelineWorker.DoWork()` @ crawler --- crawler/appsettings.json | 3 +- crawler/src/ExtensionMethods.cs | 1 + .../ImagePipeline/Ocr/PaddleOcrRecognizer.cs | 49 +++++++++++++++++ .../ImagePipeline/Ocr/PaddleOcrResponse.cs | 12 ++++- .../ImagePipeline/Ocr/RecognitionResult.cs | 9 +++- crawler/src/Program.cs | 2 +- crawler/src/Worker/ImageOcrPipelineWorker.cs | 52 ++++++++++++++----- crawler/tbm.Crawler.csproj | 5 ++ 8 files changed, 115 insertions(+), 18 deletions(-) create mode 100644 crawler/src/ImagePipeline/Ocr/PaddleOcrRecognizer.cs diff --git a/crawler/appsettings.json b/crawler/appsettings.json index fb4a2eb8..c163d01d 100644 --- a/crawler/appsettings.json +++ b/crawler/appsettings.json @@ -68,7 +68,8 @@ "GridSizeToMergeBoxesIntoSingleLine": 10, "PaddleOcr": { "ServingEndpoint": "", - "ConfidenceThreshold": 80 + "ConfidenceThreshold": 80, + "ModelPath": "./PaddleOcrModels" }, "Tesseract": { "DataPath": "", diff --git a/crawler/src/ExtensionMethods.cs b/crawler/src/ExtensionMethods.cs index 84f3a0e6..d1491b5c 100644 --- a/crawler/src/ExtensionMethods.cs +++ b/crawler/src/ExtensionMethods.cs @@ -125,6 +125,7 @@ public static void SetIfNotNull(this IDictionary dict, T1 key, T public static IEnumerable<(T item, int index)> WithIndex(this IEnumerable self) => self.Select((item, index) => (item, index));
+ public static float NanToZero(this float number) => float.IsNaN(number) ? 0 : number; public static ushort RoundToUshort(this float number) => (ushort)Math.Round(number, 0); public static ushort RoundToUshort(this double number) => (ushort)Math.Round(number, 0); } diff --git a/crawler/src/ImagePipeline/Ocr/PaddleOcrRecognizer.cs b/crawler/src/ImagePipeline/Ocr/PaddleOcrRecognizer.cs new file mode 100644 index 00000000..d68c3a2d --- /dev/null +++ b/crawler/src/ImagePipeline/Ocr/PaddleOcrRecognizer.cs @@ -0,0 +1,49 @@ +using OpenCvSharp; +using Sdcb.PaddleInference; +using Sdcb.PaddleOCR; +using Sdcb.PaddleOCR.Models; +using Sdcb.PaddleOCR.Models.Online; + +namespace tbm.Crawler.ImagePipeline.Ocr; + +public class PaddleOcrRecognizer +{ + private Dictionary _modelsKeyByScript = new(); + + public PaddleOcrRecognizer(IConfiguration config) => Settings.GlobalModelDirectory = + config.GetSection("ImageOcrPipeline").GetSection("PaddleOcr") + .GetValue("ModelPath", "./PaddleOcrModels") ?? "./PaddleOcrModels"; + + public void Dispose() => _modelsKeyByScript.ForEach(pair => pair.Value.Dispose()); + + public async Task InitializeModels(CancellationToken stoppingToken) + { + PaddleOcrAll Create(FullOcrModel model) => + new(model, PaddleDevice.Mkldnn()) + { + AllowRotateDetection = true, + Enable180Classification = true + }; + _modelsKeyByScript = new() + { + {"zh-Hans", Create(await OnlineFullModels.ChineseV3.DownloadAsync(stoppingToken))}, + {"zh-Hant", Create(await OnlineFullModels.TranditionalChinseV3.DownloadAsync(stoppingToken))}, + { + "ja", Create(await new OnlineFullModels( + OnlineDetectionModel.MultiLanguageV3, + OnlineClassificationModel.ChineseMobileV2, + LocalDictOnlineRecognizationModel.JapanV3 + ).DownloadAsync(stoppingToken)) + }, + {"en", Create(await OnlineFullModels.EnglishV3.DownloadAsync(stoppingToken))} + }; + } + + public IEnumerable RecognizeImageMatrices(Dictionary matricesKeyByImageId) => + matricesKeyByImageId.SelectMany(matrix => 
_modelsKeyByScript.SelectMany(model => + PaddleOcrRecognitionResult.FromPaddleSharp(matrix.Key, model.Key, model.Value.Run(matrix.Value)))); + + public IEnumerable DetectImageMatrices(Dictionary matricesKeyByImageId) => + matricesKeyByImageId.SelectMany(matrix => _modelsKeyByScript.SelectMany(model => + model.Value.Detector.Run(matrix.Value).Select(rect => new PaddleOcrRequester.DetectionResult(matrix.Key, rect)))); +} diff --git a/crawler/src/ImagePipeline/Ocr/PaddleOcrResponse.cs b/crawler/src/ImagePipeline/Ocr/PaddleOcrResponse.cs index ddaab2a5..c1defe65 100644 --- a/crawler/src/ImagePipeline/Ocr/PaddleOcrResponse.cs +++ b/crawler/src/ImagePipeline/Ocr/PaddleOcrResponse.cs @@ -1,5 +1,6 @@ using System.Drawing; using System.Text.Json.Serialization; +using OpenCvSharp; namespace tbm.Crawler.ImagePipeline.Ocr; @@ -27,6 +28,12 @@ public Rectangle ToCircumscribedRectangle() => Rectangle.FromLTRB( Math.Max(TopRight.X, BottomRight.X), Math.Max(BottomLeft.Y, BottomRight.Y)); + public static implicit operator TextBox(RotatedRect rotatedRect) + { + var points = rotatedRect.Points(); + return new(points[0], points[1], points[3], points[2]); + } + public float GetRotationDegrees() { if (TopLeft.X == BottomLeft.X @@ -41,7 +48,10 @@ public float GetRotationDegrees() } } - public record Coordinate(int X, int Y); + public record Coordinate(int X, int Y) + { + public static implicit operator Coordinate(Point2f point) => new(point.X.RoundToUshort(), point.Y.RoundToUshort()); + } private class ResultsConverter : JsonConverter { diff --git a/crawler/src/ImagePipeline/Ocr/RecognitionResult.cs b/crawler/src/ImagePipeline/Ocr/RecognitionResult.cs index 91fa0886..77cfce8a 100644 --- a/crawler/src/ImagePipeline/Ocr/RecognitionResult.cs +++ b/crawler/src/ImagePipeline/Ocr/RecognitionResult.cs @@ -1,3 +1,5 @@ +using Sdcb.PaddleOCR; + namespace tbm.Crawler.ImagePipeline.Ocr; public interface IRecognitionResult @@ -10,7 +12,12 @@ public interface IRecognitionResult } public record 
PaddleOcrRecognitionResult(string ImageId, string Script, - PaddleOcrResponse.TextBox TextBox, string Text, ushort Confidence) : IRecognitionResult; + PaddleOcrResponse.TextBox TextBox, string Text, ushort Confidence) : IRecognitionResult +{ + public static IEnumerable FromPaddleSharp(string imageId, string script, PaddleOcrResult result) => + result.Regions.Select(region => new PaddleOcrRecognitionResult( + imageId, script, region.Rect, region.Text, (region.Score * 100).NanToZero().RoundToUshort())); +} public record TesseractRecognitionResult(string ImageId, string Script, bool IsVertical, bool IsUnrecognized, PaddleOcrResponse.TextBox TextBox, string Text, ushort Confidence) : IRecognitionResult; diff --git a/crawler/src/Program.cs b/crawler/src/Program.cs index 7390cb4e..f774ea15 100644 --- a/crawler/src/Program.cs +++ b/crawler/src/Program.cs @@ -81,7 +81,7 @@ static void ConfigureContainer(ContainerBuilder builder) builder.RegisterType(); builder.RegisterType(); builder.RegisterType(); - builder.RegisterType(); + builder.RegisterType(); builder.RegisterType(); var baseClassOfClassesToBeRegistered = new List diff --git a/crawler/src/Worker/ImageOcrPipelineWorker.cs b/crawler/src/Worker/ImageOcrPipelineWorker.cs index e14a164a..c69400d2 100644 --- a/crawler/src/Worker/ImageOcrPipelineWorker.cs +++ b/crawler/src/Worker/ImageOcrPipelineWorker.cs @@ -1,8 +1,10 @@ -using System.Drawing; using Clipper2Lib; using Emgu.CV; using Emgu.CV.Util; +using OpenCvSharp; using tbm.Crawler.ImagePipeline.Ocr; +using Mat = OpenCvSharp.Mat; +using Point = System.Drawing.Point; namespace tbm.Crawler.Worker; @@ -10,20 +12,20 @@ public class ImageOcrPipelineWorker : ErrorableWorker { private readonly ILogger _logger; private static HttpClient _http = null!; - private readonly PaddleOcrRequester _requester; - private readonly TesseractRecognizer _recognizer; + private readonly PaddleOcrRecognizer _paddleOcrRecognizer; + private readonly TesseractRecognizer _tesseractRecognizer; 
private readonly int _gridSizeToMergeBoxesIntoSingleLine; private readonly int _paddleOcrConfidenceThreshold; private readonly int _percentageThresholdOfIntersectionAreaToConsiderAsSameTextBox; private readonly int _percentageThresholdOfIntersectionAreaToConsiderAsNewTextBox; - public ImageOcrPipelineWorker(ILogger logger, IConfiguration config, - IHttpClientFactory httpFactory, PaddleOcrRequester requester, TesseractRecognizer recognizer) : base(logger) + public ImageOcrPipelineWorker(ILogger logger, IConfiguration config, IHttpClientFactory httpFactory, + PaddleOcrRecognizer paddleOcrRecognizer, TesseractRecognizer tesseractRecognizer) : base(logger) { _logger = logger; _http = httpFactory.CreateClient("tbImage"); - _requester = requester; - _recognizer = recognizer; + _paddleOcrRecognizer = paddleOcrRecognizer; + _tesseractRecognizer = tesseractRecognizer; var configSection = config.GetSection("ImageOcrPipeline"); _gridSizeToMergeBoxesIntoSingleLine = configSection.GetValue("GridSizeToMergeBoxesIntoSingleLine", 10); _paddleOcrConfidenceThreshold = configSection.GetSection("PaddleOcr").GetValue("ConfidenceThreshold", 80); @@ -40,14 +42,36 @@ protected override async Task DoWork(CancellationToken stoppingToken) var imagesKeyByUrlFilename = (await Task.WhenAll( imagesUrlFilename.Select(async filename => (filename, bytes: await _http.GetByteArrayAsync(filename + ".jpg", stoppingToken))))) - .ToDictionary(t => t.filename, t => t.bytes); - var recognizedResultsByPaddleOcr = - (await _requester.RequestForRecognition(imagesKeyByUrlFilename, stoppingToken)) - .SelectMany(i => i).ToList(); - var detectedResultsBy = await _requester.RequestForDetection(imagesKeyByUrlFilename, stoppingToken); + .SelectMany(t => + { + Mat Flip(Mat mat, FlipMode flipMode) + { + var ret = new Mat(); + Cv2.Flip(mat, ret, flipMode); + return ret; + } + var (filename, bytes) = t; + var mat = Cv2.ImDecode(bytes, ImreadModes.Color); // convert to BGR three channels without alpha + return new 
(string Filename, Mat Mat)[] + { + (filename, mat), + (filename + "-flip", Flip(mat, FlipMode.X)), + (filename + "-flop", Flip(mat, FlipMode.Y)), + (filename + "-flip-flop", Flip(mat, FlipMode.XY)) // same with 180 degrees clockwise rotation + }; + }) + .ToDictionary(t => t.Filename, t => t.Mat); + await _paddleOcrRecognizer.InitializeModels(stoppingToken); + var recognizedResultsByPaddleOcr = _paddleOcrRecognizer.RecognizeImageMatrices(imagesKeyByUrlFilename).ToList(); + var detectionResults = _paddleOcrRecognizer.DetectImageMatrices(imagesKeyByUrlFilename); var recognizedResultsByTesseract = recognizedResultsByPaddleOcr .GroupBy(result => result.Script).Select(g => - GetRecognizedResultsByTesseract(g, detectedResultsBy, imagesKeyByUrlFilename)); + GetRecognizedResultsByTesseract(g, detectionResults, + imagesKeyByUrlFilename.ToDictionary(pair => pair.Key, pair => + { + Cv2.ImEncode(".png", pair.Value, out var ret); + return ret; + }))); foreach (var groupByImageId in recognizedResultsByPaddleOcr .Where(result => result.Confidence >= _paddleOcrConfidenceThreshold) .Concat(recognizedResultsByTesseract.SelectMany(i => i)) @@ -152,6 +176,6 @@ select g.MinBy(pair => pair.PercentageOfIntersection) return TesseractRecognizer.PreprocessTextBoxes(imageId, imagesKeyByUrlFilename[imageId], boxes); }) .SelectMany(textBoxes => textBoxes.SelectMany(b => - _recognizer.RecognizePreprocessedTextBox(recognizedResultsByPaddleOcrGroupByScript.Key, b))); + _tesseractRecognizer.RecognizePreprocessedTextBox(recognizedResultsByPaddleOcrGroupByScript.Key, b))); } } diff --git a/crawler/tbm.Crawler.csproj b/crawler/tbm.Crawler.csproj index 29ac4db3..e521ec4a 100644 --- a/crawler/tbm.Crawler.csproj +++ b/crawler/tbm.Crawler.csproj @@ -31,7 +31,12 @@ + + + + +