diff --git a/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs b/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs
deleted file mode 100644
index 02a5afff7e..0000000000
--- a/src/ImageSharp/Common/Helpers/Buffer2DUtils.cs
+++ /dev/null
@@ -1,109 +0,0 @@
-// Copyright (c) Six Labors.
-// Licensed under the Apache License, Version 2.0.
-
-using System;
-using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
-
-using SixLabors.ImageSharp.Memory;
-using SixLabors.ImageSharp.PixelFormats;
-
-namespace SixLabors.ImageSharp
-{
- ///
- /// Extension methods for .
- /// TODO: One day rewrite all this to use SIMD intrinsics. There's a lot of scope for improvement.
- ///
- internal static class Buffer2DUtils
- {
- ///
- /// Computes the sum of vectors in weighted by the kernel weight values.
- ///
- /// The pixel format.
- /// The 1D convolution kernel.
- /// The source frame.
- /// The target row.
- /// The current row.
- /// The current column.
- /// The minimum working area row.
- /// The maximum working area row.
- /// The minimum working area column.
- /// The maximum working area column.
- public static void Convolve4(
- Span kernel,
- Buffer2D sourcePixels,
- Span targetRow,
- int row,
- int column,
- int minRow,
- int maxRow,
- int minColumn,
- int maxColumn)
- where TPixel : unmanaged, IPixel
- {
- ComplexVector4 vector = default;
- int kernelLength = kernel.Length;
- int radiusY = kernelLength >> 1;
- int sourceOffsetColumnBase = column + minColumn;
- ref Complex64 baseRef = ref MemoryMarshal.GetReference(kernel);
-
- for (int i = 0; i < kernelLength; i++)
- {
- int offsetY = Numerics.Clamp(row + i - radiusY, minRow, maxRow);
- int offsetX = Numerics.Clamp(sourceOffsetColumnBase, minColumn, maxColumn);
- Span sourceRowSpan = sourcePixels.GetRowSpan(offsetY);
- var currentColor = sourceRowSpan[offsetX].ToVector4();
-
- vector.Sum(Unsafe.Add(ref baseRef, i) * currentColor);
- }
-
- targetRow[column] = vector;
- }
-
- ///
- /// Computes the sum of vectors in weighted by the kernel weight values and accumulates the partial results.
- ///
- /// The 1D convolution kernel.
- /// The source frame.
- /// The target row.
- /// The current row.
- /// The current column.
- /// The minimum working area row.
- /// The maximum working area row.
- /// The minimum working area column.
- /// The maximum working area column.
- /// The weight factor for the real component of the complex pixel values.
- /// The weight factor for the imaginary component of the complex pixel values.
- public static void Convolve4AndAccumulatePartials(
- Span kernel,
- Buffer2D sourceValues,
- Span targetRow,
- int row,
- int column,
- int minRow,
- int maxRow,
- int minColumn,
- int maxColumn,
- float z,
- float w)
- {
- ComplexVector4 vector = default;
- int kernelLength = kernel.Length;
- int radiusX = kernelLength >> 1;
- int sourceOffsetColumnBase = column + minColumn;
-
- int offsetY = Numerics.Clamp(row, minRow, maxRow);
- ref ComplexVector4 sourceRef = ref MemoryMarshal.GetReference(sourceValues.GetRowSpan(offsetY));
- ref Complex64 baseRef = ref MemoryMarshal.GetReference(kernel);
-
- for (int x = 0; x < kernelLength; x++)
- {
- int offsetX = Numerics.Clamp(sourceOffsetColumnBase + x - radiusX, minColumn, maxColumn);
- vector.Sum(Unsafe.Add(ref baseRef, x) * Unsafe.Add(ref sourceRef, offsetX));
- }
-
- targetRow[column] += vector.WeightedSum(z, w);
- }
- }
-}
diff --git a/src/ImageSharp/Common/Helpers/Numerics.cs b/src/ImageSharp/Common/Helpers/Numerics.cs
index b2bedb87b4..56ab46c685 100644
--- a/src/ImageSharp/Common/Helpers/Numerics.cs
+++ b/src/ImageSharp/Common/Helpers/Numerics.cs
@@ -547,5 +547,140 @@ public static void UnPremultiply(Span vectors)
}
}
}
+
+ ///
+ /// Calculates the cube pow of all the XYZ channels of the input vectors.
+ ///
+ /// The span of vectors
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static unsafe void CubePowOnXYZ(Span vectors)
+ {
+ ref Vector4 baseRef = ref MemoryMarshal.GetReference(vectors);
+ ref Vector4 endRef = ref Unsafe.Add(ref baseRef, vectors.Length);
+
+ while (Unsafe.IsAddressLessThan(ref baseRef, ref endRef))
+ {
+ Vector4 v = baseRef;
+ float a = v.W;
+
+ // Fast path for the default gamma exposure, which is 3. In this case we can skip
+ // calling Math.Pow 3 times (one per component), as the method is an internal call and
+ // introduces quite a bit of overhead. Instead, we can just manually multiply the whole
+ // pixel in Vector4 format 3 times, and then restore the alpha channel before copying it
+ // back to the target index in the temporary span. The whole iteration will get completely
+ // inlined and translated into vectorized instructions, with much better performance.
+ v = v * v * v;
+ v.W = a;
+
+ baseRef = v;
+ baseRef = ref Unsafe.Add(ref baseRef, 1);
+ }
+ }
+
+ ///
+ /// Calculates the cube root of all the XYZ channels of the input vectors.
+ ///
+ /// The span of vectors
+ [MethodImpl(MethodImplOptions.AggressiveInlining)]
+ public static unsafe void CubeRootOnXYZ(Span vectors)
+ {
+#if SUPPORTS_RUNTIME_INTRINSICS
+ if (Sse41.IsSupported)
+ {
+ ref Vector128 vectors128Ref = ref Unsafe.As>(ref MemoryMarshal.GetReference(vectors));
+ ref Vector128 vectors128End = ref Unsafe.Add(ref vectors128Ref, vectors.Length);
+
+ var v128_341 = Vector128.Create(341);
+ Vector128 v128_negativeZero = Vector128.Create(-0.0f).AsInt32();
+ Vector128 v128_one = Vector128.Create(1.0f).AsInt32();
+
+ var v128_13rd = Vector128.Create(1 / 3f);
+ var v128_23rds = Vector128.Create(2 / 3f);
+
+ while (Unsafe.IsAddressLessThan(ref vectors128Ref, ref vectors128End))
+ {
+ Vector128 vecx = vectors128Ref;
+ Vector128 veax = vecx.AsInt32();
+
+ // If we can use SSE41 instructions, we can vectorize the entire cube root calculation, and also execute it
+ // directly on 32 bit floating point values. What follows is a vectorized implementation of this method:
+ // https://www.musicdsp.org/en/latest/Other/206-fast-cube-root-square-root-and-reciprocal-for-x86-sse-cpus.html.
+ // Furthermore, after the initial setup in vectorized form, we're doing two Newton approximations here
+ // using a different recurrence (the same one used in the fallback below), which should be more stable as it avoids the cube pow.
+ veax = Sse2.AndNot(v128_negativeZero, veax);
+ veax = Sse2.Subtract(veax, v128_one);
+ veax = Sse2.ShiftRightArithmetic(veax, 10);
+ veax = Sse41.MultiplyLow(veax, v128_341);
+ veax = Sse2.Add(veax, v128_one);
+ veax = Sse2.AndNot(v128_negativeZero, veax);
+ veax = Sse2.Or(veax, Sse2.And(vecx.AsInt32(), v128_negativeZero));
+
+ Vector128 y4 = veax.AsSingle();
+
+ if (Fma.IsSupported)
+ {
+ y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+ y4 = Fma.MultiplyAdd(v128_23rds, y4, Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+ }
+ else
+ {
+ y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+ y4 = Sse.Add(Sse.Multiply(v128_23rds, y4), Sse.Multiply(v128_13rd, Sse.Divide(vecx, Sse.Multiply(y4, y4))));
+ }
+
+ y4 = Sse41.Insert(y4, vecx, 0xF0);
+
+ vectors128Ref = y4;
+ vectors128Ref = ref Unsafe.Add(ref vectors128Ref, 1);
+ }
+
+ return;
+ }
+#endif
+ ref Vector4 vectorsRef = ref MemoryMarshal.GetReference(vectors);
+ ref Vector4 vectorsEnd = ref Unsafe.Add(ref vectorsRef, vectors.Length);
+
+ // Fallback with scalar preprocessing and vectorized approximation steps
+ while (Unsafe.IsAddressLessThan(ref vectorsRef, ref vectorsEnd))
+ {
+ Vector4 v = vectorsRef;
+
+ double
+ x64 = v.X,
+ y64 = v.Y,
+ z64 = v.Z;
+ float a = v.W;
+
+ ulong
+ xl = *(ulong*)&x64,
+ yl = *(ulong*)&y64,
+ zl = *(ulong*)&z64;
+
+ // Here we use a trick to compute the starting value x0 for the cube root. This is because doing
+ // pow(x, 1 / gamma) is the same as the gamma-th root of x, and since gamma is 3 in this case,
+ // this means what we actually want is to find the cube root of our clamped values.
+ // For more info on the constant below, see:
+ // https://community.intel.com/t5/Intel-C-Compiler/Fast-approximate-of-transcendental-operations/td-p/1044543.
+ // Here we perform the same trick on all RGB channels separately to help the CPU execute them in parallel, and
+ // store the alpha channel to preserve it. Then we set these values to the fields of a temporary 128-bit
+ // register, and use it to accelerate two steps of the Newton approximation using SIMD.
+ xl = 0x2a9f8a7be393b600 + (xl / 3);
+ yl = 0x2a9f8a7be393b600 + (yl / 3);
+ zl = 0x2a9f8a7be393b600 + (zl / 3);
+
+ Vector4 y4;
+ y4.X = (float)*(double*)&xl;
+ y4.Y = (float)*(double*)&yl;
+ y4.Z = (float)*(double*)&zl;
+ y4.W = 0;
+
+ y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+ y4 = (2 / 3f * y4) + (1 / 3f * (v / (y4 * y4)));
+ y4.W = a;
+
+ vectorsRef = y4;
+ vectorsRef = ref Unsafe.Add(ref vectorsRef, 1);
+ }
+ }
}
}
diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
index 352960f415..d4fb27a57f 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor.cs
@@ -4,6 +4,7 @@
using System;
using System.Numerics;
using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
@@ -91,31 +92,30 @@ public IImageProcessor CreatePixelSpecificProcessor(Configuratio
/// it is actually used, because it does not use any generic parameters internally. Defining in a non-generic class means that there will only
/// ever be a single instantiation of this type for the JIT/AOT compilers to process, instead of having duplicate versions for each pixel type.
///
- internal readonly struct ApplyHorizontalConvolutionRowOperation : IRowOperation
+ internal readonly struct SecondPassConvolutionRowOperation : IRowOperation
{
private readonly Rectangle bounds;
private readonly Buffer2D targetValues;
private readonly Buffer2D sourceValues;
+ private readonly KernelSamplingMap map;
private readonly Complex64[] kernel;
private readonly float z;
private readonly float w;
- private readonly int maxY;
- private readonly int maxX;
[MethodImpl(InliningOptions.ShortMethod)]
- public ApplyHorizontalConvolutionRowOperation(
+ public SecondPassConvolutionRowOperation(
Rectangle bounds,
Buffer2D targetValues,
Buffer2D sourceValues,
+ KernelSamplingMap map,
Complex64[] kernel,
float z,
float w)
{
this.bounds = bounds;
- this.maxY = this.bounds.Bottom - 1;
- this.maxX = this.bounds.Right - 1;
this.targetValues = targetValues;
this.sourceValues = sourceValues;
+ this.map = map;
this.kernel = kernel;
this.z = z;
this.w = w;
@@ -125,11 +125,33 @@ public ApplyHorizontalConvolutionRowOperation(
[MethodImpl(InliningOptions.ShortMethod)]
public void Invoke(int y)
{
- Span targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
+ int boundsX = this.bounds.X;
+ int boundsWidth = this.bounds.Width;
+ int kernelSize = this.kernel.Length;
- for (int x = 0; x < this.bounds.Width; x++)
+ Span rowOffsets = this.map.GetRowOffsetSpan();
+ ref int sampleRowBase = ref Unsafe.Add(ref MemoryMarshal.GetReference(rowOffsets), (y - this.bounds.Y) * kernelSize);
+
+ // The target buffer is zeroed initially and then it accumulates the results
+ // of each partial convolution, so we don't have to clear it here as well
+ ref Vector4 targetBase = ref this.targetValues.GetElementUnsafe(boundsX, y);
+ ref Complex64 kernelBase = ref this.kernel[0];
+
+ for (int kY = 0; kY < kernelSize; kY++)
{
- Buffer2DUtils.Convolve4AndAccumulatePartials(this.kernel, this.sourceValues, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX, this.z, this.w);
+ // Get the precalculated source sample row for this kernel row and copy to our buffer
+ int sampleY = Unsafe.Add(ref sampleRowBase, kY);
+ ref ComplexVector4 sourceBase = ref this.sourceValues.GetElementUnsafe(0, sampleY);
+ Complex64 factor = Unsafe.Add(ref kernelBase, kY);
+
+ for (int x = 0; x < boundsWidth; x++)
+ {
+ ref Vector4 target = ref Unsafe.Add(ref targetBase, x);
+ ComplexVector4 sample = Unsafe.Add(ref sourceBase, x);
+ ComplexVector4 partial = factor * sample;
+
+ target += partial.WeightedSum(this.z, this.w);
+ }
}
}
}
diff --git a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
index dfe54bf2e3..a21155e10c 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/BokehBlurProcessor{TPixel}.cs
@@ -26,6 +26,11 @@ internal class BokehBlurProcessor : ImageProcessor
///
private readonly float gamma;
+ ///
+ /// The size of each complex convolution kernel.
+ ///
+ private readonly int kernelSize;
+
///
/// The kernel parameters to use for the current instance (a: X, b: Y, A: Z, B: W)
///
@@ -47,11 +52,12 @@ public BokehBlurProcessor(Configuration configuration, BokehBlurProcessor defini
: base(configuration, source, sourceRectangle)
{
this.gamma = definition.Gamma;
+ this.kernelSize = (definition.Radius * 2) + 1;
// Get the bokeh blur data
BokehBlurKernelData data = BokehBlurKernelDataProvider.GetBokehBlurKernelData(
definition.Radius,
- (definition.Radius * 2) + 1,
+ this.kernelSize,
definition.Components);
this.kernelParameters = data.Parameters;
@@ -71,27 +77,49 @@ public BokehBlurProcessor(Configuration configuration, BokehBlurProcessor defini
///
protected override void OnFrameApply(ImageFrame source)
{
+ var sourceRectangle = Rectangle.Intersect(this.SourceRectangle, source.Bounds());
+
// Preliminary gamma highlight pass
- var gammaOperation = new ApplyGammaExposureRowOperation(this.SourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
- ParallelRowIterator.IterateRows(
- this.Configuration,
- this.SourceRectangle,
- in gammaOperation);
+ if (this.gamma == 3F)
+ {
+ var gammaOperation = new ApplyGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration);
+ ParallelRowIterator.IterateRows(
+ this.Configuration,
+ sourceRectangle,
+ in gammaOperation);
+ }
+ else
+ {
+ var gammaOperation = new ApplyGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, this.Configuration, this.gamma);
+ ParallelRowIterator.IterateRows(
+ this.Configuration,
+ sourceRectangle,
+ in gammaOperation);
+ }
// Create a 0-filled buffer to use to store the result of the component convolutions
using Buffer2D processingBuffer = this.Configuration.MemoryAllocator.Allocate2D(source.Size(), AllocationOptions.Clean);
// Perform the 1D convolutions on all the kernel components and accumulate the results
- this.OnFrameApplyCore(source, this.SourceRectangle, this.Configuration, processingBuffer);
-
- float inverseGamma = 1 / this.gamma;
+ this.OnFrameApplyCore(source, sourceRectangle, this.Configuration, processingBuffer);
// Apply the inverse gamma exposure pass, and write the final pixel data
- var operation = new ApplyInverseGammaExposureRowOperation(this.SourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, inverseGamma);
- ParallelRowIterator.IterateRows(
- this.Configuration,
- this.SourceRectangle,
- in operation);
+ if (this.gamma == 3F)
+ {
+ var operation = new ApplyInverseGamma3ExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration);
+ ParallelRowIterator.IterateRows(
+ this.Configuration,
+ sourceRectangle,
+ in operation);
+ }
+ else
+ {
+ var operation = new ApplyInverseGammaExposureRowOperation(sourceRectangle, source.PixelBuffer, processingBuffer, this.Configuration, 1 / this.gamma);
+ ParallelRowIterator.IterateRows(
+ this.Configuration,
+ sourceRectangle,
+ in operation);
+ }
}
///
@@ -108,69 +136,129 @@ private void OnFrameApplyCore(
Buffer2D processingBuffer)
{
// Allocate the buffer with the intermediate convolution results
- using Buffer2D firstPassBuffer = this.Configuration.MemoryAllocator.Allocate2D(source.Size());
+ using Buffer2D firstPassBuffer = configuration.MemoryAllocator.Allocate2D(source.Size());
+
+ // Unlike in the standard 2 pass convolution processor, we use a rectangle of 1x the interest width
+ // to speed up the actual convolution, by applying bulk pixel conversion and clamping calculation.
+ // The second half of the buffer will just target the temporary buffer of complex pixel values.
+ // This is needed because the bokeh blur operates as TPixel -> complex -> TPixel, so we cannot
+ // convert back to standard pixels after each separate 1D convolution pass. Like in the gaussian
+ // blur though, we preallocate and compute the kernel sampling maps before processing each complex
+ // component, to avoid recomputing the same sampling map once per convolution pass. Since we are
+ // doing two 1D convolutions with the same kernel, we can use a single kernel sampling map as if
+ // we were using a 2D kernel with each dimension being the same as the length of our kernel, and
+ // use the two sampling offset spans resulting from this same map. This saves some extra work.
+ using var mapXY = new KernelSamplingMap(configuration.MemoryAllocator);
+
+ mapXY.BuildSamplingOffsetMap(this.kernelSize, this.kernelSize, sourceRectangle);
- // Perform two 1D convolutions for each component in the current instance
ref Complex64[] baseRef = ref MemoryMarshal.GetReference(this.kernels.AsSpan());
ref Vector4 paramsRef = ref MemoryMarshal.GetReference(this.kernelParameters.AsSpan());
+
+ // Perform two 1D convolutions for each component in the current instance
for (int i = 0; i < this.kernels.Length; i++)
{
// Compute the resulting complex buffer for the current component
Complex64[] kernel = Unsafe.Add(ref baseRef, i);
Vector4 parameters = Unsafe.Add(ref paramsRef, i);
- // Compute the vertical 1D convolution
- var verticalOperation = new ApplyVerticalConvolutionRowOperation(sourceRectangle, firstPassBuffer, source.PixelBuffer, kernel);
- ParallelRowIterator.IterateRows(
+ // Horizontal convolution
+ var horizontalOperation = new FirstPassConvolutionRowOperation(
+ sourceRectangle,
+ firstPassBuffer,
+ source.PixelBuffer,
+ mapXY,
+ kernel,
+ configuration);
+
+ ParallelRowIterator.IterateRows(
configuration,
sourceRectangle,
- in verticalOperation);
+ in horizontalOperation);
+
+ // Vertical 1D convolutions to accumulate the partial results on the target buffer
+ var verticalOperation = new BokehBlurProcessor.SecondPassConvolutionRowOperation(
+ sourceRectangle,
+ processingBuffer,
+ firstPassBuffer,
+ mapXY,
+ kernel,
+ parameters.Z,
+ parameters.W);
- // Compute the horizontal 1D convolutions and accumulate the partial results on the target buffer
- var horizontalOperation = new BokehBlurProcessor.ApplyHorizontalConvolutionRowOperation(sourceRectangle, processingBuffer, firstPassBuffer, kernel, parameters.Z, parameters.W);
ParallelRowIterator.IterateRows(
configuration,
sourceRectangle,
- in horizontalOperation);
+ in verticalOperation);
}
}
///
/// A implementing the vertical convolution logic for .
///
- private readonly struct ApplyVerticalConvolutionRowOperation : IRowOperation
+ private readonly struct FirstPassConvolutionRowOperation : IRowOperation
{
private readonly Rectangle bounds;
private readonly Buffer2D targetValues;
private readonly Buffer2D sourcePixels;
+ private readonly KernelSamplingMap map;
private readonly Complex64[] kernel;
- private readonly int maxY;
- private readonly int maxX;
+ private readonly Configuration configuration;
[MethodImpl(InliningOptions.ShortMethod)]
- public ApplyVerticalConvolutionRowOperation(
+ public FirstPassConvolutionRowOperation(
Rectangle bounds,
Buffer2D targetValues,
Buffer2D sourcePixels,
- Complex64[] kernel)
+ KernelSamplingMap map,
+ Complex64[] kernel,
+ Configuration configuration)
{
this.bounds = bounds;
- this.maxY = this.bounds.Bottom - 1;
- this.maxX = this.bounds.Right - 1;
this.targetValues = targetValues;
this.sourcePixels = sourcePixels;
+ this.map = map;
this.kernel = kernel;
+ this.configuration = configuration;
}
///
[MethodImpl(InliningOptions.ShortMethod)]
- public void Invoke(int y)
+ public void Invoke(int y, Span span)
{
- Span targetRowSpan = this.targetValues.GetRowSpan(y).Slice(this.bounds.X);
+ int boundsX = this.bounds.X;
+ int boundsWidth = this.bounds.Width;
+ int kernelSize = this.kernel.Length;
- for (int x = 0; x < this.bounds.Width; x++)
+ // Clear the target buffer for each row run
+ Span targetBuffer = this.targetValues.GetRowSpan(y);
+ targetBuffer.Clear();
+ ref ComplexVector4 targetBase = ref MemoryMarshal.GetReference(targetBuffer);
+
+ // Execute the bulk pixel format conversion for the current row
+ Span sourceRow = this.sourcePixels.GetRowSpan(y).Slice(boundsX, boundsWidth);
+ PixelOperations.Instance.ToVector4(this.configuration, sourceRow, span);
+
+ ref Vector4 sourceBase = ref MemoryMarshal.GetReference(span);
+ ref Complex64 kernelBase = ref this.kernel[0];
+ ref int sampleColumnBase = ref MemoryMarshal.GetReference(this.map.GetColumnOffsetSpan());
+
+ for (int x = 0; x < span.Length; x++)
{
- Buffer2DUtils.Convolve4(this.kernel, this.sourcePixels, targetRowSpan, y, x, this.bounds.Y, this.maxY, this.bounds.X, this.maxX);
+ ref ComplexVector4 target = ref Unsafe.Add(ref targetBase, x);
+
+ for (int kX = 0; kX < kernelSize; kX++)
+ {
+ int sampleX = Unsafe.Add(ref sampleColumnBase, kX) - boundsX;
+ Vector4 sample = Unsafe.Add(ref sourceBase, sampleX);
+ Complex64 factor = Unsafe.Add(ref kernelBase, kX);
+
+ target.Sum(factor * sample);
+ }
+
+ // Shift the base column sampling reference by one row at the end of each outer
+ // iteration so that the inner tight loop indexing can skip the multiplication
+ sampleColumnBase = ref Unsafe.Add(ref sampleColumnBase, kernelSize);
}
}
}
@@ -218,6 +306,40 @@ public void Invoke(int y, Span span)
}
}
+ ///
+ /// A implementing the 3F gamma exposure logic for .
+ ///
+ private readonly struct ApplyGamma3ExposureRowOperation : IRowOperation
+ {
+ private readonly Rectangle bounds;
+ private readonly Buffer2D targetPixels;
+ private readonly Configuration configuration;
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public ApplyGamma3ExposureRowOperation(
+ Rectangle bounds,
+ Buffer2D targetPixels,
+ Configuration configuration)
+ {
+ this.bounds = bounds;
+ this.targetPixels = targetPixels;
+ this.configuration = configuration;
+ }
+
+ ///
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public void Invoke(int y, Span span)
+ {
+ Span targetRowSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+
+ PixelOperations.Instance.ToVector4(this.configuration, targetRowSpan.Slice(0, span.Length), span, PixelConversionModifiers.Premultiply);
+
+ Numerics.CubePowOnXYZ(span);
+
+ PixelOperations.Instance.FromVector4Destructive(this.configuration, span, targetRowSpan);
+ }
+ }
+
///
/// A implementing the inverse gamma exposure logic for .
///
@@ -267,5 +389,44 @@ public void Invoke(int y)
PixelOperations.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
}
}
+
+ ///
+ /// A implementing the inverse 3F gamma exposure logic for .
+ ///
+ private readonly struct ApplyInverseGamma3ExposureRowOperation : IRowOperation
+ {
+ private readonly Rectangle bounds;
+ private readonly Buffer2D targetPixels;
+ private readonly Buffer2D sourceValues;
+ private readonly Configuration configuration;
+
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public ApplyInverseGamma3ExposureRowOperation(
+ Rectangle bounds,
+ Buffer2D targetPixels,
+ Buffer2D sourceValues,
+ Configuration configuration)
+ {
+ this.bounds = bounds;
+ this.targetPixels = targetPixels;
+ this.sourceValues = sourceValues;
+ this.configuration = configuration;
+ }
+
+ ///
+ [MethodImpl(InliningOptions.ShortMethod)]
+ public unsafe void Invoke(int y)
+ {
+ Span sourceRowSpan = this.sourceValues.GetRowSpan(y).Slice(this.bounds.X, this.bounds.Width);
+ ref Vector4 sourceRef = ref MemoryMarshal.GetReference(sourceRowSpan);
+
+ Numerics.Clamp(MemoryMarshal.Cast(sourceRowSpan), 0, float.PositiveInfinity);
+ Numerics.CubeRootOnXYZ(sourceRowSpan);
+
+ Span targetPixelSpan = this.targetPixels.GetRowSpan(y).Slice(this.bounds.X);
+
+ PixelOperations.Instance.FromVector4Destructive(this.configuration, sourceRowSpan.Slice(0, this.bounds.Width), targetPixelSpan, PixelConversionModifiers.Premultiply);
+ }
+ }
}
}
diff --git a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
index 151b0ffccc..16ce0fdd75 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/Convolution2PassProcessor{TPixel}.cs
@@ -1,10 +1,7 @@
// Copyright (c) Six Labors.
// Licensed under the Apache License, Version 2.0.
-using System;
using System.Numerics;
-using System.Runtime.CompilerServices;
-using System.Runtime.InteropServices;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.Memory;
using SixLabors.ImageSharp.PixelFormats;
diff --git a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
index e4b7dbea09..904b599f7c 100644
--- a/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
+++ b/src/ImageSharp/Processing/Processors/Convolution/KernelSamplingMap.cs
@@ -31,9 +31,16 @@ internal sealed class KernelSamplingMap : IDisposable
/// The convolution kernel.
/// The source bounds.
public void BuildSamplingOffsetMap(DenseMatrix kernel, Rectangle bounds)
+ => this.BuildSamplingOffsetMap(kernel.Rows, kernel.Columns, bounds);
+
+ ///
+ /// Builds a map of the sampling offsets for the kernel clamped by the given bounds.
+ ///
+ /// The height (number of rows) of the convolution kernel to use.
+ /// The width (number of columns) of the convolution kernel to use.
+ /// The source bounds.
+ public void BuildSamplingOffsetMap(int kernelHeight, int kernelWidth, Rectangle bounds)
{
- int kernelHeight = kernel.Rows;
- int kernelWidth = kernel.Columns;
this.yOffsets = this.allocator.Allocate(bounds.Height * kernelHeight);
this.xOffsets = this.allocator.Allocate(bounds.Width * kernelWidth);
@@ -92,8 +99,8 @@ public void Dispose()
{
if (!this.isDisposed)
{
- this.yOffsets.Dispose();
- this.xOffsets.Dispose();
+ this.yOffsets?.Dispose();
+ this.xOffsets?.Dispose();
this.isDisposed = true;
}
diff --git a/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs b/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs
new file mode 100644
index 0000000000..1c3b1a7b24
--- /dev/null
+++ b/tests/ImageSharp.Benchmarks/Samplers/BokehBlur.cs
@@ -0,0 +1,22 @@
+// Copyright (c) Six Labors.
+// Licensed under the Apache License, Version 2.0.
+
+using BenchmarkDotNet.Attributes;
+using SixLabors.ImageSharp.PixelFormats;
+using SixLabors.ImageSharp.Processing;
+
+namespace SixLabors.ImageSharp.Benchmarks.Samplers
+{
+ [Config(typeof(Config.MultiFramework))]
+ public class BokehBlur
+ {
+ [Benchmark]
+ public void Blur()
+ {
+ using (var image = new Image(Configuration.Default, 400, 400, Color.White))
+ {
+ image.Mutate(c => c.BokehBlur());
+ }
+ }
+ }
+}
diff --git a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
index 6c48cf843d..dbf59a29ba 100644
--- a/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
+++ b/tests/ImageSharp.Tests/Processing/Processors/Convolution/BokehBlurTest.cs
@@ -6,7 +6,6 @@
using System.Globalization;
using System.Linq;
using System.Text.RegularExpressions;
-using Microsoft.DotNet.RemoteExecutor;
using SixLabors.ImageSharp.Advanced;
using SixLabors.ImageSharp.PixelFormats;
using SixLabors.ImageSharp.Processing;
@@ -44,9 +43,8 @@ [[ 0.00451261+0.0165137j 0.02161237-0.00299122j 0.00387479-0.02682816j
[InlineData(20, 4, -10f)]
[InlineData(20, 4, 0f)]
public void VerifyBokehBlurProcessorArguments_Fail(int radius, int components, float gamma)
- {
- Assert.Throws(() => new BokehBlurProcessor(radius, components, gamma));
- }
+ => Assert.Throws(
+ () => new BokehBlurProcessor(radius, components, gamma));
[Fact]
public void VerifyComplexComponents()
@@ -137,12 +135,10 @@ public void Serialize(IXunitSerializationInfo info)
[WithTestPatternImages(nameof(BokehBlurValues), 30, 20, PixelTypes.Rgba32)]
public void BokehBlurFilterProcessor(TestImageProvider provider, BokehBlurInfo value)
where TPixel : unmanaged, IPixel
- {
- provider.RunValidatingProcessorTest(
+ => provider.RunValidatingProcessorTest(
x => x.BokehBlur(value.Radius, value.Components, value.Gamma),
testOutputDetails: value.ToString(),
appendPixelTypeToFileName: false);
- }
[Theory]
/*
@@ -152,18 +148,23 @@ public void BokehBlurFilterProcessor(TestImageProvider provider,
[WithTestPatternImages(200, 200, PixelTypes.Bgr24 | PixelTypes.Bgra32)]
public void BokehBlurFilterProcessor_WorksWithAllPixelTypes(TestImageProvider provider)
where TPixel : unmanaged, IPixel
- {
- provider.RunValidatingProcessorTest(
- x => x.BokehBlur(8, 2, 3),
- appendSourceFileOrDescription: false);
- }
+ => provider.RunValidatingProcessorTest(
+ x => x.BokehBlur(8, 2, 3),
+ appendSourceFileOrDescription: false);
[Theory]
[WithFileCollection(nameof(TestFiles), nameof(BokehBlurValues), PixelTypes.Rgba32)]
- public void BokehBlurFilterProcessor_Bounded(TestImageProvider provider, BokehBlurInfo value)
- where TPixel : unmanaged, IPixel
+ public void BokehBlurFilterProcessor_Bounded(TestImageProvider provider, BokehBlurInfo value)
{
- provider.RunValidatingProcessorTest(
+ static void RunTest(string arg1, string arg2)
+ {
+ TestImageProvider provider =
+ FeatureTestRunner.DeserializeForXunit>(arg1);
+
+ BokehBlurInfo value =
+ FeatureTestRunner.DeserializeForXunit(arg2);
+
+ provider.RunValidatingProcessorTest(
x =>
{
Size size = x.GetCurrentSize();
@@ -172,14 +173,19 @@ public void BokehBlurFilterProcessor_Bounded(TestImageProvider p
},
testOutputDetails: value.ToString(),
appendPixelTypeToFileName: false);
+ }
+
+ FeatureTestRunner.RunWithHwIntrinsicsFeature(
+ RunTest,
+ HwIntrinsics.DisableSSE41,
+ provider,
+ value);
}
[Theory]
[WithTestPatternImages(100, 300, PixelTypes.Bgr24)]
public void WorksWithDiscoBuffers(TestImageProvider provider)
where TPixel : unmanaged, IPixel
- {
- provider.RunBufferCapacityLimitProcessorTest(41, c => c.BokehBlur());
- }
+ => provider.RunBufferCapacityLimitProcessorTest(260, c => c.BokehBlur());
}
}
diff --git a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
index 4720ea78ac..fa0f02ca1f 100644
--- a/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
+++ b/tests/ImageSharp.Tests/TestUtilities/FeatureTesting/FeatureTestRunner.cs
@@ -211,6 +211,53 @@ public static void RunWithHwIntrinsicsFeature(
}
}
+ ///
+ /// Runs the given test within an environment
+ /// where the given intrinsics features are disabled.
+ ///
+ /// The test action to run.
+ /// The intrinsics features.
+ /// The value to pass as a parameter to the test action.
+ /// The second value to pass as a parameter to the test action.
+ public static void RunWithHwIntrinsicsFeature(
+ Action action,
+ HwIntrinsics intrinsics,
+ T arg1,
+ T2 arg2)
+ where T : IXunitSerializable
+ where T2 : IXunitSerializable
+ {
+ if (!RemoteExecutor.IsSupported)
+ {
+ return;
+ }
+
+ foreach (KeyValuePair intrinsic in intrinsics.ToFeatureKeyValueCollection())
+ {
+ var processStartInfo = new ProcessStartInfo();
+ if (intrinsic.Key != HwIntrinsics.AllowAll)
+ {
+ processStartInfo.Environment[$"COMPlus_{intrinsic.Value}"] = "0";
+
+ RemoteExecutor.Invoke(
+ action,
+ BasicSerializer.Serialize(arg1),
+ BasicSerializer.Serialize(arg2),
+ new RemoteInvokeOptions
+ {
+ StartInfo = processStartInfo
+ })
+ .Dispose();
+ }
+ else
+ {
+ // Since we are running using the default architecture there is no
+ // point creating the overhead of running the action in a separate process.
+ action(BasicSerializer.Serialize(arg1), BasicSerializer.Serialize(arg2));
+ }
+ }
+ }
+
///
/// Runs the given test within an environment
/// where the given features.