From 8e11f278bf8117392a4c0e0801c727c37e4cc6d2 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Mon, 17 Feb 2025 18:40:37 -0800 Subject: [PATCH 1/5] Import cpu-sparse prototype This brings in the cpu-sparse prototype from the piet-next branch of the piet repo. No substantive changes, but cpu-sparse is renamed vello_hybrid and piet-next is renamed vello_api. Quite a bit of editing to satisfy the lint monster. There was a half-written SIMD implementation of flattening, that's removed. It should be finished and re-added, as it's a good speedup. --- Cargo.lock | 60 +++++- Cargo.toml | 2 + vello_api/Cargo.toml | 23 +++ vello_api/README.md | 5 + vello_api/src/any.rs | 254 +++++++++++++++++++++++ vello_api/src/generic_record.rs | 148 ++++++++++++++ vello_api/src/lib.rs | 148 ++++++++++++++ vello_hybrid/Cargo.toml | 28 +++ vello_hybrid/README.md | 5 + vello_hybrid/examples/simple.rs | 41 ++++ vello_hybrid/examples/svg.rs | 350 ++++++++++++++++++++++++++++++++ vello_hybrid/src/fine.rs | 124 +++++++++++ vello_hybrid/src/flatten.rs | 51 +++++ vello_hybrid/src/lib.rs | 20 ++ vello_hybrid/src/pixmap.rs | 39 ++++ vello_hybrid/src/render.rs | 282 +++++++++++++++++++++++++ vello_hybrid/src/simd.rs | 84 ++++++++ vello_hybrid/src/simd/neon.rs | 232 +++++++++++++++++++++ vello_hybrid/src/strip.rs | 209 +++++++++++++++++++ vello_hybrid/src/tiling.rs | 305 ++++++++++++++++++++++++++++ vello_hybrid/src/wide_tile.rs | 57 ++++++ 21 files changed, 2463 insertions(+), 4 deletions(-) create mode 100644 vello_api/Cargo.toml create mode 100644 vello_api/README.md create mode 100644 vello_api/src/any.rs create mode 100644 vello_api/src/generic_record.rs create mode 100644 vello_api/src/lib.rs create mode 100644 vello_hybrid/Cargo.toml create mode 100644 vello_hybrid/README.md create mode 100644 vello_hybrid/examples/simple.rs create mode 100644 vello_hybrid/examples/svg.rs create mode 100644 vello_hybrid/src/fine.rs create mode 100644 vello_hybrid/src/flatten.rs create mode 100644 vello_hybrid/src/lib.rs create mode 100644 vello_hybrid/src/pixmap.rs create mode 100644 vello_hybrid/src/render.rs create mode 100644 vello_hybrid/src/simd.rs create mode 100644 vello_hybrid/src/simd/neon.rs create mode 100644 vello_hybrid/src/strip.rs create mode 100644 vello_hybrid/src/tiling.rs create mode 100644 vello_hybrid/src/wide_tile.rs diff --git a/Cargo.lock b/Cargo.lock index c3471e9e..e1b746b7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -424,6 +424,14 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "color" +version = "0.1.0" +source = "git+https://github.com/linebender/color.git?rev=a4fa61aff6c3f292b729dc409e7832e5f0166e4a#a4fa61aff6c3f292b729dc409e7832e5f0166e4a" +dependencies = [ + "serde", +] + [[package]] name = "color" version = "0.2.3" @@ -709,6 +717,17 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "flatten" +version = "0.1.0" +source = "git+https://github.com/linebender/gpu-stroke-expansion-paper?rev=827ccf6#827ccf6766179340a83f2de3417b7bdd8743706a" +dependencies = [ + "arrayvec", + "clap", + "kurbo", + "roxmltree 0.19.0", +] + [[package]] name = "foldhash" version = "0.1.4" @@ -1693,13 +1712,23 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" +[[package]] +name = "peniko" +version = "0.2.0" +source = "git+https://github.com/linebender/peniko?rev=aeded39#aeded39d0ea2fccf7db598c83b83748f3b88195f" +dependencies = [ + "color 0.1.0", + "kurbo", + "smallvec", +] + [[package]] name = "peniko" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2ec061f435b573ff602b2c5690f1a5bfcf461327194654cfbfce4412b95cf2a1" dependencies = [ - "color", + "color 0.2.3", "kurbo", "smallvec", ] @@ -1979,6 +2008,12 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "19b30a45b0cd0bcca8037f3d0dc3421eaf95327a17cad11964fb8179b4fc4832" +[[package]] +name = "roxmltree" +version = "0.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3cd14fd5e3b777a7422cca79358c57a8f6e3a703d9ac187448d0daf220c2407f" + [[package]] name = "roxmltree" version = "0.20.0" @@ -2048,7 +2083,7 @@ dependencies = [ "getrandom 0.3.1", "image", "rand", - "roxmltree", + "roxmltree 0.20.0", "skrifa", "vello", "web-time", @@ -2581,7 +2616,7 @@ dependencies = [ "bytemuck", "futures-intrusive", "log", - "peniko", + "peniko 0.3.1", "png", "skrifa", "static_assertions", @@ -2592,17 +2627,34 @@ dependencies = [ "wgpu-profiler", ] +[[package]] +name = "vello_api" +version = "0.1.0" +dependencies = [ + "peniko 0.2.0", +] + [[package]] name = "vello_encoding" version = "0.4.0" dependencies = [ "bytemuck", "guillotiere", - "peniko", + "peniko 0.3.1", "skrifa", "smallvec", ] +[[package]] +name = "vello_hybrid" +version = "0.1.0" +dependencies = [ + "flatten", + "png", + "roxmltree 0.20.0", + "vello_api", +] + [[package]] name = "vello_shaders" version = "0.4.0" diff --git a/Cargo.toml b/Cargo.toml index 6c958715..d460d13d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -2,7 +2,9 @@ resolver = "2" members = [ "vello", + "vello_api", "vello_encoding", + "vello_hybrid", "vello_shaders", "vello_tests", diff --git a/vello_api/Cargo.toml b/vello_api/Cargo.toml new file mode 100644 index 00000000..ca133d96 --- /dev/null +++ b/vello_api/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "vello_api" +version = "0.1.0" +authors = ["Raph Levien "] +description = "A testbend for next-generation 2D renderer ideas" +keywords = ["graphics", "2d"] +categories = ["graphics"] +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true + +[package.metadata.docs.rs] +all-features = true +# There are no platform specific docs. +default-target = "x86_64-unknown-linux-gnu" +targets = [] + +[lints] +workspace = true + +[dependencies] +peniko = { git = "https://github.com/linebender/peniko", rev = "aeded39" } diff --git a/vello_api/README.md b/vello_api/README.md new file mode 100644 index 00000000..6ff57647 --- /dev/null +++ b/vello_api/README.md @@ -0,0 +1,5 @@ +# vello_api + +Experiment in API abstraction for 2D rendering. + +TODO: explain diff --git a/vello_api/src/any.rs b/vello_api/src/any.rs new file mode 100644 index 00000000..1306297f --- /dev/null +++ b/vello_api/src/any.rs @@ -0,0 +1,254 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +#![allow(unused, reason = "prototyping")] + +use std::{any::Any, sync::Arc}; + +use peniko::{kurbo::Affine, BrushRef}; + +use crate::{Id, Path, Record, RenderCtx, ResourceCtx}; + +#[derive(Clone)] +pub struct AnyImage { + // TODO: move id into trait + id: Id, + body: Arc, +} + +pub trait AnyRecord: Send { + fn as_any(&mut self) -> &mut dyn std::any::Any; + + fn dyn_finish(&mut self) -> Arc; +} + +impl AnyRecord for R +where + <::Resource as ResourceCtx>::Recording: Sync, +{ + fn as_any(&mut self) -> &mut dyn std::any::Any { + self + } + + fn dyn_finish(&mut self) -> Arc { + let recording = self.finish(); + Arc::new(recording) + } +} + +pub trait AnyRenderCtx { + fn as_any(&mut self) -> &mut dyn std::any::Any; + + fn dyn_playback(&mut self, recording: &Arc); + + fn dyn_fill(&mut self, path: &Path, brush: BrushRef<'_>); +} + +impl AnyRenderCtx for RC { + fn as_any(&mut self) -> &mut dyn std::any::Any { + self + } + + fn dyn_playback(&mut self, recording: &Arc) { + if let Some(recording) = recording.downcast_ref() { + self.playback(recording); + } else { + panic!("downcast error on playback"); + } + } + + fn dyn_fill(&mut self, path: &Path, brush: BrushRef<'_>) { + self.fill(path, brush); + } +} + +pub type BoxedRenderCtx = Box; + +impl RenderCtx for BoxedRenderCtx { + type Resource = Box; + + fn playback(&mut self, recording: &Arc<::Recording>) { + self.dyn_playback(recording); + } + + fn fill(&mut self, path: &Path, brush: BrushRef<'_>) { + self.dyn_fill(path, brush); + } + + fn stroke(&mut self, path: &Path, stroke: &peniko::kurbo::Stroke, brush: BrushRef<'_>) { + todo!() + } + + fn draw_image( + &mut self, + image: &::Image, + dst_rect: peniko::kurbo::Rect, + interp: crate::InterpolationMode, + ) { + todo!() + } + + fn clip(&mut self, path: &Path) { + todo!() + } + + fn save(&mut self) { + todo!() + } + + fn restore(&mut self) { + todo!() + } + + fn transform(&mut self, affine: Affine) { + todo!() + } + + fn begin_draw_glyphs(&mut self, font: &peniko::Font) { + todo!() + } + + fn font_size(&mut self, size: f32) { + todo!() + } + + fn hint(&mut self, hint: bool) { + todo!() + } + + fn glyph_brush(&mut self, brush: BrushRef<'_>) { + todo!() + } + + fn draw_glyphs( + &mut self, + style: peniko::StyleRef<'_>, + glyphs: &dyn Iterator, + ) { + todo!() + } + + fn end_draw_glyphs(&mut self) { + todo!() + } +} + +pub trait AnyResourceCtx { + fn as_any(&mut self) -> &mut dyn std::any::Any; + + fn dyn_record(&mut self) -> Box; + + fn dyn_make_image_with_stride( + &mut self, + width: usize, + height: usize, + stride: usize, + buf: &[u8], + format: crate::ImageFormat, + ) -> Result; +} + +impl ResourceCtx for Box { + type Image = AnyImage; + + type Recording = dyn Any + Send; + + type Record = Box; + + fn record(&mut self) -> Self::Record { + self.dyn_record() + } + + fn make_image_with_stride( + &mut self, + width: usize, + height: usize, + stride: usize, + buf: &[u8], + format: crate::ImageFormat, + ) -> Result { + let image = self.dyn_make_image_with_stride(width, height, stride, buf, format)?; + let id = Id::get(); + Ok(AnyImage { + id, + body: Arc::new(image), + }) + } +} + +pub struct BoxedAnyRecord(Option>); + +impl RenderCtx for Box { + type Resource = Box; + + fn playback(&mut self, recording: &Arc<::Recording>) { + self.dyn_playback(recording); + } + + fn fill(&mut self, path: &Path, brush: BrushRef<'_>) { + self.dyn_fill(path, brush); + } + + fn stroke(&mut self, path: &Path, stroke: &peniko::kurbo::Stroke, brush: BrushRef<'_>) { + todo!() + } + + fn draw_image( + &mut self, + image: &::Image, + dst_rect: peniko::kurbo::Rect, + interp: crate::InterpolationMode, + ) { + todo!() + } + + fn clip(&mut self, path: &Path) { + todo!() + } + + fn save(&mut self) { + todo!() + } + + fn restore(&mut self) { + todo!() + } + + fn transform(&mut self, affine: Affine) { + todo!() + } + + fn begin_draw_glyphs(&mut self, font: &peniko::Font) { + todo!() + } + + fn font_size(&mut self, size: f32) { + todo!() + } + + fn hint(&mut self, hint: bool) { + todo!() + } + + fn glyph_brush(&mut self, brush: BrushRef<'_>) { + todo!() + } + + fn draw_glyphs( + &mut self, + style: peniko::StyleRef<'_>, + glyphs: &dyn Iterator, + ) { + todo!() + } + + fn end_draw_glyphs(&mut self) { + todo!() + } +} + +impl Record for Box { + fn finish(&mut self) -> Arc<::Recording> { + self.dyn_finish() + } +} diff --git a/vello_api/src/generic_record.rs b/vello_api/src/generic_record.rs new file mode 100644 index 00000000..cd991d5e --- /dev/null +++ b/vello_api/src/generic_record.rs @@ -0,0 +1,148 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +#![allow(unused, reason = "prototyping")] + +use std::sync::Arc; + +use peniko::{kurbo::Rect, Brush}; + +use crate::{InterpolationMode, Path, Record, RenderCtx, ResourceCtx}; + +pub struct GenericRecorder { + cmds: Vec>, +} + +pub struct GenericResources { + inner: RC::Resource, +} + +enum Cmd { + Fill(Path, Brush), + Image( + ::Image, + Rect, + InterpolationMode, + ), +} + +impl GenericRecorder { + #[allow( + clippy::new_without_default, + reason = "didn't we get rid of this lint? it's so annoying" + )] + pub fn new() -> Self { + let cmds = Vec::new(); + Self { cmds } + } + + pub fn play(&self, ctx: &mut RC) { + for cmd in &self.cmds { + match cmd { + Cmd::Fill(path, brush) => ctx.fill(path, brush.into()), + Cmd::Image(image, rect, interp) => ctx.draw_image(image, *rect, *interp), + } + } + } +} + +impl RenderCtx for GenericRecorder { + type Resource = GenericResources; + + fn playback(&mut self, recording: &std::sync::Arc<::Recording>) { + todo!() + } + + fn fill(&mut self, path: &Path, brush: peniko::BrushRef<'_>) { + self.cmds.push(Cmd::Fill(path.clone(), brush.to_owned())); + } + + fn stroke(&mut self, path: &Path, stroke: &peniko::kurbo::Stroke, brush: peniko::BrushRef<'_>) { + todo!() + } + + fn draw_image( + &mut self, + image: &::Image, + dst_rect: peniko::kurbo::Rect, + interp: crate::InterpolationMode, + ) { + let image = image.clone(); + self.cmds.push(Cmd::Image(image, dst_rect, interp)); + } + + fn clip(&mut self, path: &Path) { + todo!() + } + + fn save(&mut self) { + todo!() + } + + fn restore(&mut self) { + todo!() + } + + fn transform(&mut self, affine: peniko::kurbo::Affine) { + todo!() + } + + fn begin_draw_glyphs(&mut self, font: &peniko::Font) { + todo!() + } + + fn font_size(&mut self, size: f32) { + todo!() + } + + fn hint(&mut self, hint: bool) { + todo!() + } + + fn glyph_brush(&mut self, brush: peniko::BrushRef<'_>) { + todo!() + } + + fn draw_glyphs( + &mut self, + style: peniko::StyleRef<'_>, + glyphs: &dyn Iterator, + ) { + todo!() + } + + fn end_draw_glyphs(&mut self) { + todo!() + } +} + +impl ResourceCtx for GenericResources { + type Image = ::Image; + + type Recording = GenericRecorder; + + type Record = GenericRecorder; + + fn record(&mut self) -> Self::Record { + GenericRecorder::new() + } + + fn make_image_with_stride( + &mut self, + width: usize, + height: usize, + stride: usize, + buf: &[u8], + format: crate::ImageFormat, + ) -> Result { + self.inner + .make_image_with_stride(width, height, stride, buf, format) + } +} + +impl Record for GenericRecorder { + fn finish(&mut self) -> Arc<::Recording> { + let cmds = std::mem::take(&mut self.cmds); + Arc::new(Self { cmds }) + } +} diff --git a/vello_api/src/lib.rs b/vello_api/src/lib.rs new file mode 100644 index 00000000..926cd7e9 --- /dev/null +++ b/vello_api/src/lib.rs @@ -0,0 +1,148 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +#![allow(missing_docs, reason = "will add them later")] +#![allow(missing_debug_implementations, reason = "prototyping")] +#![allow(clippy::todo, reason = "still a prototype")] + +use std::{ + num::NonZeroU64, + sync::{atomic::AtomicU64, Arc}, +}; + +pub use peniko; + +use peniko::{ + kurbo::{Affine, BezPath, Rect, Stroke}, + BrushRef, Font, StyleRef, +}; + +mod any; +mod generic_record; + +pub use any::{AnyImage, AnyRecord, AnyRenderCtx, AnyResourceCtx, BoxedAnyRecord, BoxedRenderCtx}; +pub use generic_record::{GenericRecorder, GenericResources}; + +#[derive(Clone, Copy, PartialEq, Eq, Hash)] +pub struct Id(NonZeroU64); + +// TODO: think this through +pub type Error = Box; + +#[derive(Clone)] +pub struct Path { + pub id: Id, + pub path: BezPath, + // TODO: Vello encoding. kurbo BezPath can be used in interim + // Question: probably want to special-case rect, line, ellipse at least + // Probably also rounded-rect (incl varying corner radii) +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub enum ImageFormat { + Grayscale, + Rgb, + RgbaSeparate, + RgbaPremul, +} + +#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)] +pub enum InterpolationMode { + NearestNeighbor, + Bilinear, + // TODO: want to add cubic etc +} + +/// Positioned glyph. This type matches Vello. +pub struct Glyph { + pub id: u32, + pub x: f32, + pub y: f32, +} + +pub trait RenderCtx { + type Resource: ResourceCtx; + + fn playback(&mut self, recording: &Arc<::Recording>); + + // should even-odd be an arg or another method? + fn fill(&mut self, path: &Path, brush: BrushRef<'_>); + + fn stroke(&mut self, path: &Path, stroke: &Stroke, brush: BrushRef<'_>); + + // TODO: clamp/extend/mirror + fn draw_image( + &mut self, + image: &::Image, + dst_rect: Rect, + interp: InterpolationMode, + ); + + fn clip(&mut self, path: &Path); + + fn save(&mut self); + + fn restore(&mut self); + + fn transform(&mut self, affine: Affine); + + /// Start a glyph drawing operation + /// + /// The glyph drawing operation ends with [`RenderCtx::end_draw_glyphs`] + fn begin_draw_glyphs(&mut self, font: &Font); + + // Following methods are borrowed from Vello's DrawGlyph + fn font_size(&mut self, size: f32); + + fn hint(&mut self, hint: bool); + + fn glyph_brush(&mut self, brush: BrushRef<'_>); + + fn draw_glyphs(&mut self, style: StyleRef<'_>, glyphs: &dyn Iterator); + + fn end_draw_glyphs(&mut self); +} + +pub trait Record: RenderCtx { + // It should be possible to take self by move, but that triggers E0161 + fn finish(&mut self) -> Arc<::Recording>; +} + +pub trait ResourceCtx { + type Image: Clone + Send; + + type Recording: Send + ?Sized; + + type Record: Record + Send; + + fn record(&mut self) -> Self::Record; + + fn make_image_with_stride( + &mut self, + width: usize, + height: usize, + stride: usize, + buf: &[u8], + format: ImageFormat, + ) -> Result; +} + +static ID_COUNTER: AtomicU64 = AtomicU64::new(0); + +impl Id { + pub fn get() -> Self { + let n = ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed); + if let Some(x) = n.checked_add(1) { + Self(NonZeroU64::new(x).unwrap()) + } else { + panic!("wow, overflow of u64, congratulations") + } + } +} + +impl From for Path { + fn from(path: BezPath) -> Self { + let id = Id::get(); + Self { id, path } + } +} diff --git a/vello_hybrid/Cargo.toml b/vello_hybrid/Cargo.toml new file mode 100644 index 00000000..2656de48 --- /dev/null +++ b/vello_hybrid/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "vello_hybrid" +version = "0.1.0" +authors = ["Raph Levien "] +description = "An experimental CPU 2D renderer based on sparse strips" +keywords = ["graphics", "2d"] +categories = ["graphics"] +edition.workspace = true +rust-version.workspace = true +license.workspace = true +repository.workspace = true + +[package.metadata.docs.rs] +all-features = true +# There are no platform specific docs. +default-target = "x86_64-unknown-linux-gnu" +targets = [] + +[lints] +workspace = true + +[dependencies] +vello_api = { path = "../vello_api" } +flatten = { git = "https://github.com/linebender/gpu-stroke-expansion-paper", rev = "827ccf6" } + +[dev-dependencies] +png = "0.17.14" +roxmltree = "0.20.0" diff --git a/vello_hybrid/README.md b/vello_hybrid/README.md new file mode 100644 index 00000000..3764d234 --- /dev/null +++ b/vello_hybrid/README.md @@ -0,0 +1,5 @@ +# vello_hybrid + +Experiment in hybrid CPU/GPU rendering. + +TODO: explain diff --git a/vello_hybrid/examples/simple.rs b/vello_hybrid/examples/simple.rs new file mode 100644 index 00000000..e91f0175 --- /dev/null +++ b/vello_hybrid/examples/simple.rs @@ -0,0 +1,41 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +#![allow(missing_docs, reason = "will add them later")] +#![allow(clippy::cast_possible_truncation, reason = "we're doing it on purpose")] + +use std::io::BufWriter; + +use vello_api::peniko::color::palette; +use vello_api::peniko::kurbo::{BezPath, Stroke}; +use vello_api::RenderCtx; +use vello_hybrid::{CsRenderCtx, Pixmap}; + +const WIDTH: usize = 1024; +const HEIGHT: usize = 256; + +pub fn main() { + let mut ctx = CsRenderCtx::new(WIDTH, HEIGHT); + let mut path = BezPath::new(); + path.move_to((10.0, 10.0)); + path.line_to((180.0, 20.0)); + path.line_to((30.0, 40.0)); + path.close_path(); + let piet_path = path.into(); + ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into()); + let stroke = Stroke::new(5.0); + ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into()); + if let Some(filename) = std::env::args().nth(1) { + let mut pixmap = Pixmap::new(WIDTH, HEIGHT); + ctx.render_to_pixmap(&mut pixmap); + pixmap.unpremultiply(); + let file = std::fs::File::create(filename).unwrap(); + let w = BufWriter::new(file); + let mut encoder = png::Encoder::new(w, WIDTH as u32, HEIGHT as u32); + encoder.set_color(png::ColorType::Rgba); + let mut writer = encoder.write_header().unwrap(); + writer.write_image_data(pixmap.data()).unwrap(); + } else { + ctx.debug_dump(); + } +} diff --git a/vello_hybrid/examples/svg.rs b/vello_hybrid/examples/svg.rs new file mode 100644 index 00000000..85312b8c --- /dev/null +++ b/vello_hybrid/examples/svg.rs @@ -0,0 +1,350 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +#![allow(missing_docs, reason = "will add them later")] +#![allow(missing_debug_implementations, reason = "prototyping")] +#![allow(clippy::cast_possible_truncation, reason = "we're doing it on purpose")] + +//! SVG example for hybrid renderer + +use std::io::BufWriter; +use std::str::FromStr; + +use roxmltree::{Document, Node}; +use vello_api::peniko::color::palette; +use vello_api::peniko::kurbo::{Affine, BezPath, Point, Size, Stroke, Vec2}; +use vello_api::peniko::Color; +use vello_api::RenderCtx; +use vello_hybrid::{CsRenderCtx, Pixmap}; + +const WIDTH: usize = 1024; +const HEIGHT: usize = 1024; + +/// The main function of the example. The German word for main is "Haupt". +pub fn main() { + let mut ctx = CsRenderCtx::new(WIDTH, HEIGHT); + let mut args = std::env::args().skip(1); + let svg_filename = args.next().expect("svg filename is first arg"); + let out_filename = args.next().expect("png out filename is second arg"); + + let svg = std::fs::read_to_string(svg_filename).expect("error reading file"); + let parsed = PicoSvg::load(&svg, 1.0).expect("error parsing SVG"); + let mut pixmap = Pixmap::new(WIDTH, HEIGHT); + // Hacky code for crude measurements; change this to arg parsing + for i in 0..200 { + ctx.reset(); + let start = std::time::Instant::now(); + render_svg(&mut ctx, &parsed.items); + let coarse_time = start.elapsed(); + ctx.render_to_pixmap(&mut pixmap); + if i % 100 == 0 { + println!( + "time to coarse: {coarse_time:?}, time to fine: {:?}", + start.elapsed() + ); + } + } + pixmap.unpremultiply(); + let file = std::fs::File::create(out_filename).unwrap(); + let w = BufWriter::new(file); + let mut encoder = png::Encoder::new(w, WIDTH as u32, HEIGHT as u32); + encoder.set_color(png::ColorType::Rgba); + let mut writer = encoder.write_header().unwrap(); + writer.write_image_data(pixmap.data()).unwrap(); +} + +fn render_svg(ctx: &mut impl RenderCtx, items: &[Item]) { + for item in items { + match item { + Item::Fill(fill_item) => ctx.fill(&fill_item.path, fill_item.color.into()), + Item::Stroke(stroke_item) => { + let style = Stroke::new(stroke_item.width); + ctx.stroke(&stroke_item.path, &style, stroke_item.color.into()); + } + Item::Group(group_item) => { + // TODO: apply transform from group + render_svg(ctx, &group_item.children); + } + } + } +} + +// Below is copied, lightly adapted, from Vello. + +pub struct PicoSvg { + pub items: Vec, + #[allow(unused, reason = "functionality NYI")] + pub size: Size, +} + +pub enum Item { + Fill(FillItem), + Stroke(StrokeItem), + Group(GroupItem), +} + +pub struct StrokeItem { + pub width: f64, + pub color: Color, + pub path: vello_api::Path, +} + +pub struct FillItem { + pub color: Color, + pub path: vello_api::Path, +} + +pub struct GroupItem { + #[allow(unused, reason = "functionality NYI")] + pub affine: Affine, + pub children: Vec, +} + +struct Parser { + scale: f64, +} + +impl PicoSvg { + pub fn load(xml_string: &str, scale: f64) -> Result> { + let doc = Document::parse(xml_string)?; + let root = doc.root_element(); + let mut parser = Parser::new(scale); + let width = root.attribute("width").and_then(|s| f64::from_str(s).ok()); + let height = root.attribute("height").and_then(|s| f64::from_str(s).ok()); + let (origin, viewbox_size) = root + .attribute("viewBox") + .and_then(|vb_attr| { + let vs: Vec = vb_attr + .split(' ') + .map(|s| f64::from_str(s).unwrap()) + .collect(); + if let &[x, y, vb_width, vb_height] = vs.as_slice() { + Some(( + Point { x, y }, + Size { + width: vb_width, + height: vb_height, + }, + )) + } else { + None + } + }) + .unzip(); + + let mut transform = if let Some(origin) = origin { + Affine::translate(origin.to_vec2() * -1.0) + } else { + Affine::IDENTITY + }; + + transform *= match (width, height, viewbox_size) { + (None, None, Some(_)) => Affine::IDENTITY, + (Some(w), Some(h), Some(s)) => { + Affine::scale_non_uniform(1.0 / s.width * w, 1.0 / s.height * h) + } + (Some(w), None, Some(s)) => Affine::scale(1.0 / s.width * w), + (None, Some(h), Some(s)) => Affine::scale(1.0 / s.height * h), + _ => Affine::IDENTITY, + }; + + let size = match (width, height, viewbox_size) { + (None, None, Some(s)) => s, + (mw, mh, None) => Size { + width: mw.unwrap_or(300_f64), + height: mh.unwrap_or(150_f64), + }, + (Some(w), None, Some(s)) => Size { + width: w, + height: 1.0 / w * s.width * s.height, + }, + (None, Some(h), Some(s)) => Size { + width: 1.0 / h * s.height * s.width, + height: h, + }, + (Some(width), Some(height), Some(_)) => Size { width, height }, + }; + + transform *= if scale >= 0.0 { + Affine::scale(scale) + } else { + Affine::new([-scale, 0.0, 0.0, scale, 0.0, 0.0]) + }; + let props = RecursiveProperties { + fill: Some(Color::BLACK), + }; + // The root element is the svg document element, which we don't care about + let mut items = Vec::new(); + for node in root.children() { + parser.rec_parse(node, &props, &mut items)?; + } + let root_group = Item::Group(GroupItem { + affine: transform, + children: items, + }); + Ok(Self { + items: vec![root_group], + size, + }) + } +} + +#[derive(Clone)] +struct RecursiveProperties { + fill: Option, +} + +impl Parser { + fn new(scale: f64) -> Self { + Self { scale } + } + + fn rec_parse( + &mut self, + node: Node<'_, '_>, + properties: &RecursiveProperties, + items: &mut Vec, + ) -> Result<(), Box> { + if node.is_element() { + let mut properties = properties.clone(); + if let Some(fill_color) = node.attribute("fill") { + if fill_color == "none" { + properties.fill = None; + } else { + let color = parse_color(fill_color); + let color = modify_opacity(color, "fill-opacity", node); + // TODO: Handle recursive opacity properly + let color = modify_opacity(color, "opacity", node); + properties.fill = Some(color); + } + } + match node.tag_name().name() { + "g" => { + let mut children = Vec::new(); + let mut affine = Affine::default(); + if let Some(transform) = node.attribute("transform") { + affine = parse_transform(transform); + } + for child in node.children() { + self.rec_parse(child, &properties, &mut children)?; + } + items.push(Item::Group(GroupItem { affine, children })); + } + "path" => { + let d = node.attribute("d").ok_or("missing 'd' attribute")?; + let bp = BezPath::from_svg(d)?; + let path: vello_api::Path = bp.into(); + if let Some(color) = properties.fill { + items.push(Item::Fill(FillItem { + color, + path: path.clone(), + })); + } + if let Some(stroke_color) = node.attribute("stroke") { + if stroke_color != "none" { + let width = node + .attribute("stroke-width") + .map(|a| f64::from_str(a).unwrap_or(1.0)) + .unwrap_or(1.0) + * self.scale.abs(); + let color = parse_color(stroke_color); + let color = modify_opacity(color, "stroke-opacity", node); + // TODO: Handle recursive opacity properly + let color = modify_opacity(color, "opacity", node); + items.push(Item::Stroke(StrokeItem { width, color, path })); + } + } + } + other => eprintln!("Unhandled node type {other}"), + } + } + Ok(()) + } +} + +fn parse_transform(transform: &str) -> Affine { + let mut nt = Affine::IDENTITY; + for ts in transform.split(')').map(str::trim) { + nt *= if let Some(s) = ts.strip_prefix("matrix(") { + let vals = s + .split([',', ' ']) + .map(str::parse) + .collect::, _>>() + .expect("Could parse all values of 'matrix' as floats"); + Affine::new( + vals.try_into() + .expect("Should be six arguments to `matrix`"), + ) + } else if let Some(s) = ts.strip_prefix("translate(") { + if let Ok(vals) = s + .split([',', ' ']) + .map(str::trim) + .map(str::parse) + .collect::, _>>() + { + match vals.as_slice() { + &[x, y] => Affine::translate(Vec2 { x, y }), + _ => Affine::IDENTITY, + } + } else { + Affine::IDENTITY + } + } else if let Some(s) = ts.strip_prefix("scale(") { + if let Ok(vals) = s + .split([',', ' ']) + .map(str::trim) + .map(str::parse) + .collect::, _>>() + { + match *vals.as_slice() { + [x, y] => Affine::scale_non_uniform(x, y), + [x] => Affine::scale(x), + _ => Affine::IDENTITY, + } + } else { + Affine::IDENTITY + } + } else if let Some(s) = ts.strip_prefix("scaleX(") { + s.trim() + .parse() + .ok() + .map(|x| Affine::scale_non_uniform(x, 1.0)) + .unwrap_or(Affine::IDENTITY) + } else if let Some(s) = ts.strip_prefix("scaleY(") { + s.trim() + .parse() + .ok() + .map(|y| Affine::scale_non_uniform(1.0, y)) + .unwrap_or(Affine::IDENTITY) + } else { + if !ts.is_empty() { + eprintln!("Did not understand transform attribute {ts:?})"); + } + Affine::IDENTITY + }; + } + nt +} + +fn parse_color(color: &str) -> Color { + let color = color.trim(); + if let Ok(c) = vello_api::peniko::color::parse_color(color) { + c.to_alpha_color() + } else { + palette::css::MAGENTA.with_alpha(0.5) + } +} + +fn modify_opacity(color: Color, attr_name: &str, node: Node<'_, '_>) -> Color { + if let Some(opacity) = node.attribute(attr_name) { + let alpha: f64 = if let Some(o) = opacity.strip_suffix('%') { + let pctg = o.parse().unwrap_or(100.0); + pctg * 0.01 + } else { + opacity.parse().unwrap_or(1.0) + }; + color.with_alpha(alpha as f32) + } else { + color + } +} diff --git a/vello_hybrid/src/fine.rs b/vello_hybrid/src/fine.rs new file mode 100644 index 00000000..4b935528 --- /dev/null +++ b/vello_hybrid/src/fine.rs @@ -0,0 +1,124 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! Fine rasterization + +use crate::wide_tile::{Cmd, STRIP_HEIGHT, WIDE_TILE_WIDTH}; + +const STRIP_HEIGHT_F32: usize = STRIP_HEIGHT * 4; + +pub(crate) struct Fine<'a> { + pub(crate) width: usize, + pub(crate) height: usize, + // rgba pixels + pub(crate) out_buf: &'a mut [u8], + // f32 RGBA pixels + // That said, if we use u8, then this is basically a block of + // untyped memory. + pub(crate) scratch: [f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4], + #[allow(clippy::doc_markdown, reason = "false positive for x86_64")] + /// Whether to use SIMD + /// + /// This is useful to toggle for performance evaluation reasons. It also + /// *must* be false if runtime detection fails, otherwise we have safety + /// problems. This is important for x86_64, as we'll be targeting Haswell + /// as the minimum. + #[allow(unused, reason = "some platforms might not have SIMD")] + // The allow(unused) lint exception is because some platforms may not have + // a SIMD implementation, and thus won't check the field. + pub(crate) use_simd: bool, +} + +impl<'a> Fine<'a> { + pub(crate) fn new(width: usize, height: usize, out_buf: &'a mut [u8]) -> Self { + let scratch = [0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]; + Self { + width, + height, + out_buf, + scratch, + use_simd: true, + } + } + + pub(crate) fn clear_scalar(&mut self, color: [f32; 4]) { + for z in self.scratch.chunks_exact_mut(4) { + z.copy_from_slice(&color); + } + } + + pub(crate) fn pack_scalar(&mut self, x: usize, y: usize) { + // Note that these can trigger if the method is called on a pixmap that + // is not an integral multiple of the tile. + assert!( + (x + 1) * WIDE_TILE_WIDTH <= self.width, + "overflow of pixmap width" + ); + assert!( + (y + 1) * STRIP_HEIGHT <= self.height, + "overflow of pixmap height" + ); + let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4; + for j in 0..STRIP_HEIGHT { + let line_ix = base_ix + j * self.width * 4; + for i in 0..WIDE_TILE_WIDTH { + let mut rgba_f32 = [0.0; 4]; + rgba_f32.copy_from_slice(&self.scratch[(i * STRIP_HEIGHT + j) * 4..][..4]); + let rgba_u8 = rgba_f32.map(|z| (z * 255.0).round() as u8); + self.out_buf[line_ix + i * 4..][..4].copy_from_slice(&rgba_u8); + } + } + } + + pub(crate) fn run_cmd(&mut self, cmd: &Cmd, alphas: &[u32]) { + match cmd { + Cmd::Fill(f) => { + self.fill(f.x as usize, f.width as usize, f.color.components); + } + Cmd::Strip(s) => { + let aslice = &alphas[s.alpha_ix..]; + self.strip(s.x as usize, s.width as usize, aslice, s.color.components); + } + } + } + + pub(crate) fn fill_scalar(&mut self, x: usize, width: usize, color: [f32; 4]) { + if color[3] == 1.0 { + for z in + self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) + { + z.copy_from_slice(&color); + } + } else { + let one_minus_alpha = 1.0 - color[3]; + for z in + self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) + { + for i in 0..4 { + //z[i] = color[i] + one_minus_alpha * z[i]; + // Note: the mul_add will perform poorly on x86_64 default cpu target + // Probably right thing to do is craft a #cfg that detects fma, fcma, etc. + // What we really want is fmuladdf32 from intrinsics! + z[i] = z[i].mul_add(one_minus_alpha, color[i]); + } + } + } + } + + pub(crate) fn strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) { + debug_assert!(alphas.len() >= width, "overflow of alphas buffer"); + let cs = color.map(|z| z * (1.0 / 255.0)); + for (z, a) in self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width] + .chunks_exact_mut(16) + .zip(alphas) + { + for j in 0..4 { + let mask_alpha = ((*a >> (j * 8)) & 0xff) as f32; + let one_minus_alpha = 1.0 - mask_alpha * cs[3]; + for i in 0..4 { + z[j * 4 + i] = z[j * 4 + i].mul_add(one_minus_alpha, mask_alpha * cs[i]); + } + } + } + } +} diff --git a/vello_hybrid/src/flatten.rs b/vello_hybrid/src/flatten.rs new file mode 100644 index 00000000..1d0bbc58 --- /dev/null +++ b/vello_hybrid/src/flatten.rs @@ -0,0 +1,51 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! Utilities for flattening + +use flatten::stroke::LoweredPath; +use vello_api::peniko::kurbo::{self, Affine, BezPath, Line, Point, Stroke}; + +use crate::tiling::FlatLine; + +/// The flattening tolerance +const TOL: f64 = 0.25; + +pub(crate) fn fill(path: &BezPath, affine: Affine, line_buf: &mut Vec) { + line_buf.clear(); + let mut start = Point::default(); + let mut p0 = Point::default(); + let iter = path.iter().map(|el| affine * el); + kurbo::flatten(iter, TOL, |el| match el { + kurbo::PathEl::MoveTo(p) => { + start = p; + p0 = p; + } + kurbo::PathEl::LineTo(p) => { + let pt0 = [p0.x as f32, p0.y as f32]; + let pt1 = [p.x as f32, p.y as f32]; + line_buf.push(FlatLine::new(pt0, pt1)); + p0 = p; + } + kurbo::PathEl::QuadTo(_, _) => unreachable!(), + kurbo::PathEl::CurveTo(_, _, _) => unreachable!(), + kurbo::PathEl::ClosePath => { + let pt0 = [p0.x as f32, p0.y as f32]; + let pt1 = [start.x as f32, start.y as f32]; + if pt0 != pt1 { + line_buf.push(FlatLine::new(pt0, pt1)); + } + } + }); +} + +pub(crate) fn stroke(path: &BezPath, style: &Stroke, affine: Affine, line_buf: &mut Vec) { + line_buf.clear(); + let iter = path.iter().map(|el| affine * el); + let lines: LoweredPath = flatten::stroke::stroke_undashed(iter, style, TOL); + for line in &lines.path { + let p0 = [line.p0.x as f32, line.p0.y as f32]; + let p1 = [line.p1.x as f32, line.p1.y as f32]; + line_buf.push(FlatLine::new(p0, p1)); + } +} diff --git a/vello_hybrid/src/lib.rs b/vello_hybrid/src/lib.rs new file mode 100644 index 00000000..341c6e90 --- /dev/null +++ b/vello_hybrid/src/lib.rs @@ -0,0 +1,20 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +#![allow(missing_docs, reason = "will add them later")] +#![allow(missing_debug_implementations, reason = "prototyping")] +#![allow(clippy::todo, reason = "still a prototype")] +#![allow(clippy::cast_possible_truncation, reason = "we need to do this a lot")] + +mod fine; +mod flatten; +mod pixmap; +mod render; +mod simd; +mod strip; +mod tiling; +mod wide_tile; + +pub use pixmap::Pixmap; +pub use render::{CsRenderCtx, CsResourceCtx}; +pub use tiling::FlatLine; diff --git a/vello_hybrid/src/pixmap.rs b/vello_hybrid/src/pixmap.rs new file mode 100644 index 00000000..7a40bef6 --- /dev/null +++ b/vello_hybrid/src/pixmap.rs @@ -0,0 +1,39 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! A simple pixmap type + +pub struct Pixmap { + pub(crate) width: usize, + pub(crate) height: usize, + pub(crate) buf: Vec, +} + +impl Pixmap { + pub fn new(width: usize, height: usize) -> Self { + let buf = vec![0; width * height * 4]; + Self { width, height, buf } + } + + pub fn data(&self) -> &[u8] { + &self.buf + } + + pub fn data_mut(&mut self) -> &mut [u8] { + &mut self.buf + } + + /// Convert from premultiplied to separate alpha. + /// + /// Not fast, but useful for saving to PNG etc. + pub fn unpremultiply(&mut self) { + for rgba in self.buf.chunks_exact_mut(4) { + let alpha = rgba[3] as f32 * (1.0 / 255.0); + if alpha != 0.0 { + rgba[0] = (rgba[0] as f32 / alpha).round().min(255.0) as u8; + rgba[1] = (rgba[1] as f32 / alpha).round().min(255.0) as u8; + rgba[2] = (rgba[2] as f32 / alpha).round().min(255.0) as u8; + } + } + } +} diff --git a/vello_hybrid/src/render.rs b/vello_hybrid/src/render.rs new file mode 100644 index 00000000..8216ef5a --- /dev/null +++ b/vello_hybrid/src/render.rs @@ -0,0 +1,282 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// Remove when all methods are implemented. +#![allow(unused, reason = "lots of unused arguments from todo methods")] + +use std::collections::BTreeMap; + +use vello_api::{ + peniko::{ + color::{palette, AlphaColor, Srgb}, + kurbo::Affine, + BrushRef, + }, + GenericRecorder, RenderCtx, ResourceCtx, +}; + +use crate::{ + fine::Fine, + strip::{self, Strip, Tile}, + tiling::{self, FlatLine}, + wide_tile::{Cmd, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH}, + Pixmap, +}; + +pub struct CsRenderCtx { + width: usize, + height: usize, + tiles: Vec, + alphas: Vec, + + /// These are all scratch buffers, to be used for path rendering. They're here solely + /// so the allocations can be reused. + line_buf: Vec, + tile_buf: Vec, + strip_buf: Vec, +} + +pub struct CsResourceCtx; + +impl CsRenderCtx { + pub fn new(width: usize, height: usize) -> Self { + let width_tiles = width.div_ceil(WIDE_TILE_WIDTH); + let height_tiles = height.div_ceil(STRIP_HEIGHT); + let tiles = (0..width_tiles * height_tiles) + .map(|_| WideTile::default()) + .collect(); + let alphas = vec![]; + let line_buf = vec![]; + let tile_buf = vec![]; + let strip_buf = vec![]; + Self { + width, + height, + tiles, + alphas, + line_buf, + tile_buf, + strip_buf, + } + } + + pub fn reset(&mut self) { + for tile in &mut self.tiles { + tile.bg = AlphaColor::TRANSPARENT; + tile.cmds.clear(); + } + } + + pub fn render_to_pixmap(&self, pixmap: &mut Pixmap) { + let mut fine = Fine::new(pixmap.width, pixmap.height, &mut pixmap.buf); + let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH); + let height_tiles = (self.height).div_ceil(STRIP_HEIGHT); + for y in 0..height_tiles { + for x in 0..width_tiles { + let tile = &self.tiles[y * width_tiles + x]; + fine.clear(tile.bg.components); + for cmd in &tile.cmds { + fine.run_cmd(cmd, &self.alphas); + } + fine.pack(x, y); + } + } + } + + pub fn tile_stats(&self) { + let mut histo = BTreeMap::new(); + let mut total = 0; + for tile in &self.tiles { + let count = tile.cmds.len(); + total += count; + *histo.entry(count).or_insert(0) += 1; + } + println!("total = {total}, {histo:?}"); + } + + /// Render a path, which has already been flattened into `line_buf`. + fn render_path(&mut self, brush: BrushRef<'_>) { + // TODO: need to make sure tiles contained in viewport - we'll likely + // panic otherwise. + tiling::make_tiles(&self.line_buf, &mut self.tile_buf); + self.tile_buf.sort_unstable_by(Tile::cmp); + crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas); + let color = brush_to_color(brush); + let width_tiles = self.width.div_ceil(WIDE_TILE_WIDTH); + for i in 0..self.strip_buf.len() - 1 { + let strip = &self.strip_buf[i]; + let next_strip = &self.strip_buf[i + 1]; + let x0 = strip.x(); + let y = strip.strip_y(); + let row_start = y as usize * width_tiles; + let strip_width = next_strip.col - strip.col; + let x1 = x0 + strip_width; + let xtile0 = x0 as usize / WIDE_TILE_WIDTH; + let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH); + let mut x = x0; + let mut col = strip.col; + for xtile in xtile0..xtile1 { + let x_tile_rel = x % WIDE_TILE_WIDTH as u32; + let width = x1.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x; + let cmd = CmdStrip { + x: x_tile_rel, + width, + alpha_ix: col as usize, + color, + }; + x += width; + col += width; + self.tiles[row_start + xtile].push(Cmd::Strip(cmd)); + } + if next_strip.winding != 0 && y == next_strip.strip_y() { + x = x1; + let x2 = next_strip.x(); + let fxt0 = x1 as usize / WIDE_TILE_WIDTH; + let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH); + for xtile in fxt0..fxt1 { + let x_tile_rel = x % WIDE_TILE_WIDTH as u32; + let width = x2.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x; + x += width; + self.tiles[row_start + xtile].fill(x_tile_rel, width, color); + } + } + } + } + + pub fn debug_dump(&self) { + let width_tiles = self.width.div_ceil(WIDE_TILE_WIDTH); + for (i, tile) in self.tiles.iter().enumerate() { + if !tile.cmds.is_empty() || tile.bg.components[3] != 0.0 { + let x = i % width_tiles; + let y = i / width_tiles; + println!("tile {x}, {y} bg {}", tile.bg.to_rgba8()); + for cmd in &tile.cmds { + println!("{cmd:?}"); + } + } + } + } + + fn get_affine(&self) -> Affine { + // TODO: get from graphics state + Affine::scale(5.0) + } +} + +impl RenderCtx for CsRenderCtx { + type Resource = CsResourceCtx; + + fn playback( + &mut self, + recording: &std::sync::Arc<::Recording>, + ) { + recording.play(self); + } + + fn fill(&mut self, path: &vello_api::Path, brush: BrushRef<'_>) { + let affine = self.get_affine(); + crate::flatten::fill(&path.path, affine, &mut self.line_buf); + self.render_path(brush); + } + + fn stroke( + &mut self, + path: &vello_api::Path, + stroke: &vello_api::peniko::kurbo::Stroke, + brush: BrushRef<'_>, + ) { + let affine = self.get_affine(); + crate::flatten::stroke(&path.path, stroke, affine, &mut self.line_buf); + self.render_path(brush); + } + + fn draw_image( + &mut self, + image: &::Image, + dst_rect: vello_api::peniko::kurbo::Rect, + interp: vello_api::InterpolationMode, + ) { + todo!() + } + + fn clip(&mut self, path: &vello_api::Path) { + todo!() + } + + fn save(&mut self) { + todo!() + } + + fn restore(&mut self) { + todo!() + } + + fn transform(&mut self, affine: vello_api::peniko::kurbo::Affine) { + todo!() + } + + fn begin_draw_glyphs(&mut self, font: &vello_api::peniko::Font) { + todo!() + } + + fn font_size(&mut self, size: f32) { + todo!() + } + + fn hint(&mut self, hint: bool) { + todo!() + } + + fn glyph_brush(&mut self, brush: BrushRef<'_>) { + todo!() + } + + fn draw_glyphs( + &mut self, + style: vello_api::peniko::StyleRef<'_>, + glyphs: &dyn Iterator, + ) { + todo!() + } + + fn end_draw_glyphs(&mut self) { + todo!() + } +} + +impl ResourceCtx for CsResourceCtx { + type Image = (); + + type Recording = GenericRecorder; + + type Record = GenericRecorder; + + fn record(&mut self) -> Self::Record { + GenericRecorder::new() + } + + fn make_image_with_stride( + &mut self, + width: usize, + height: usize, + stride: usize, + buf: &[u8], + format: vello_api::ImageFormat, + ) -> Result { + todo!() + } +} + +/// Get the color from the brush. +/// +/// This is a hacky function that will go away when we implement +/// other brushes. The general form is to match on whether it's a +/// solid color. If not, then issue a cmd to render the brush into +/// a brush buffer, then fill/strip as needed to composite into +/// the main buffer. +fn brush_to_color(brush: BrushRef<'_>) -> AlphaColor { + match brush { + BrushRef::Solid(c) => c, + _ => palette::css::MAGENTA, + } +} diff --git a/vello_hybrid/src/simd.rs b/vello_hybrid/src/simd.rs new file mode 100644 index 00000000..c2a2614e --- /dev/null +++ b/vello_hybrid/src/simd.rs @@ -0,0 +1,84 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! SIMD speedups + +use crate::{ + fine::Fine, + strip::{Strip, Tile}, +}; + +#[cfg(target_arch = "aarch64")] +pub(crate) mod neon; + +// This block is when we have SIMD +#[cfg(target_arch = "aarch64")] +impl Fine<'_> { + pub(crate) fn pack(&mut self, x: usize, y: usize) { + if self.use_simd { + self.pack_simd(x, y); + } else { + self.pack_scalar(x, y); + } + } + + pub(crate) fn clear(&mut self, color: [f32; 4]) { + if self.use_simd { + unsafe { + self.clear_simd(color); + } + } else { + self.clear_scalar(color); + } + } + + pub(crate) fn fill(&mut self, x: usize, width: usize, color: [f32; 4]) { + if self.use_simd { + unsafe { + self.fill_simd(x, width, color); + } + } else { + self.fill_scalar(x, width, color); + } + } + + pub(crate) fn strip(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) { + if self.use_simd { + unsafe { + self.strip_simd(x, width, alphas, color); + } + } else { + self.strip_scalar(x, width, alphas, color); + } + } +} + +#[cfg(target_arch = "aarch64")] +pub(crate) fn render_strips(tiles: &[Tile], strip_buf: &mut Vec, alpha_buf: &mut Vec) { + neon::render_strips_simd(tiles, strip_buf, alpha_buf); +} + +#[cfg(not(target_arch = "aarch64"))] +pub fn render_strips(tiles: &[Tile], strip_buf: &mut Vec, alpha_buf: &mut Vec) { + crate::strip::render_strips_scalar(tiles, strip_buf, alpha_buf); +} + +// This block is the fallback, no SIMD +#[cfg(not(target_arch = "aarch64"))] +impl<'a> Fine<'a> { + pub(crate) fn pack(&mut self, x: usize, y: usize) { + self.pack_scalar(x, y); + } + + pub(crate) fn clear(&mut self, color: [f32; 4]) { + self.clear_scalar(color); + } + + pub(crate) fn fill(&mut self, x: usize, y: usize, color: [f32; 4]) { + self.fill_scalar(x, y, color); + } + + pub(crate) fn strip(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) { + self.strip_scalar(x, width, alphas, color); + } +} diff --git a/vello_hybrid/src/simd/neon.rs b/vello_hybrid/src/simd/neon.rs new file mode 100644 index 00000000..72bf9202 --- /dev/null +++ b/vello_hybrid/src/simd/neon.rs @@ -0,0 +1,232 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! SIMD speedups for Neon + +use core::arch::aarch64::*; + +use crate::{ + fine::Fine, + strip::{Strip, Tile}, + tiling::Vec2, + wide_tile::{STRIP_HEIGHT, WIDE_TILE_WIDTH}, +}; + +impl Fine<'_> { + pub(crate) unsafe fn clear_simd(&mut self, color: [f32; 4]) { + unsafe { + let v_color = vld1q_f32(color.as_ptr()); + let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color); + for i in 0..WIDE_TILE_WIDTH { + vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4); + } + } + } + + pub(crate) fn pack_simd(&mut self, x: usize, y: usize) { + unsafe fn cvt(v: float32x4_t) -> uint8x16_t { + unsafe { + let clamped = vminq_f32(v, vdupq_n_f32(1.0)); + let scaled = vmulq_f32(clamped, vdupq_n_f32(255.0)); + vreinterpretq_u8_u32(vcvtnq_u32_f32(scaled)) + } + } + + unsafe fn cvt2(v0: float32x4_t, v1: float32x4_t) -> uint8x16_t { + unsafe { vuzp1q_u8(cvt(v0), cvt(v1)) } + } + + unsafe { + let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4; + for i in (0..WIDE_TILE_WIDTH).step_by(4) { + let chunk_ix = base_ix + i * 4; + let v0 = vld1q_f32_x4(self.scratch.as_ptr().add(i * 16)); + let v1 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 1) * 16)); + let x0 = cvt2(v0.0, v1.0); + let x1 = cvt2(v0.1, v1.1); + let x2 = cvt2(v0.2, v1.2); + let x3 = cvt2(v0.3, v1.3); + let v2 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 2) * 16)); + let v3 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 3) * 16)); + let x4 = cvt2(v2.0, v3.0); + let y0 = vuzp1q_u8(x0, x4); + vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix), y0); + let x5 = cvt2(v2.1, v3.1); + let y1 = vuzp1q_u8(x1, x5); + vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix + self.width * 4), y1); + let x6 = cvt2(v2.2, v3.2); + let y2 = vuzp1q_u8(x2, x6); + vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix + self.width * 8), y2); + let x7 = cvt2(v2.3, v3.3); + let y3 = vuzp1q_u8(x3, x7); + vst1q_u8( + self.out_buf.as_mut_ptr().add(chunk_ix + self.width * 12), + y3, + ); + } + } + } + + pub(crate) unsafe fn fill_simd(&mut self, x: usize, width: usize, color: [f32; 4]) { + unsafe { + let v_color = vld1q_f32(color.as_ptr()); + let alpha = color[3]; + if alpha == 1.0 { + let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color); + for i in x..x + width { + vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4); + } + } else { + let one_minus_alpha = vdupq_n_f32(1.0 - alpha); + for i in x..x + width { + let ix = (x + i) * 16; + let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix)); + v.0 = vfmaq_f32(v_color, v.0, one_minus_alpha); + v.1 = vfmaq_f32(v_color, v.1, one_minus_alpha); + v.2 = vfmaq_f32(v_color, v.2, one_minus_alpha); + v.3 = vfmaq_f32(v_color, v.3, one_minus_alpha); + vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v); + } + } + } + } + + #[inline(never)] + pub(crate) unsafe fn strip_simd( + &mut self, + x: usize, + width: usize, + alphas: &[u32], + color: [f32; 4], + ) { + unsafe { + debug_assert!(alphas.len() >= width, "overflow of alphas buffer"); + let v_color = vmulq_f32(vld1q_f32(color.as_ptr()), vdupq_n_f32(1.0 / 255.0)); + for i in 0..width { + let a = *alphas.get_unchecked(i); + // all this zipping compiles to tbl, we should probably just write that + let a1 = vreinterpret_u8_u32(vdup_n_u32(a)); + let a2 = vreinterpret_u16_u8(vzip1_u8(a1, vdup_n_u8(0))); + let a3 = vcombine_u16(a2, vdup_n_u16(0)); + let a4 = vreinterpretq_u32_u16(vzip1q_u16(a3, vdupq_n_u16(0))); + let alpha = vcvtq_f32_u32(a4); + let ix = (x + i) * 16; + let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix)); + let one_minus_alpha = vfmsq_laneq_f32(vdupq_n_f32(1.0), alpha, v_color, 3); + v.0 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 0), v.0, one_minus_alpha, 0); + v.1 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 1), v.1, one_minus_alpha, 1); + v.2 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 2), v.2, one_minus_alpha, 2); + v.3 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 3), v.3, one_minus_alpha, 3); + vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v); + } + } + } +} + +#[inline(never)] +pub(crate) fn render_strips_simd( + tiles: &[Tile], + strip_buf: &mut Vec, + alpha_buf: &mut Vec, +) { + unsafe { + strip_buf.clear(); + let mut strip_start = true; + let mut cols = alpha_buf.len() as u32; + let mut prev_tile = &tiles[0]; + let mut fp = prev_tile.footprint().0; + let mut seg_start = 0; + let mut delta = 0; + // Note: the input should contain a sentinel tile, to avoid having + // logic here to process the final strip. + const IOTA: [f32; 4] = [0.0, 1.0, 2.0, 3.0]; + let iota = vld1q_f32(IOTA.as_ptr()); + for i in 1..tiles.len() { + let tile = &tiles[i]; + if prev_tile.loc() != tile.loc() { + let start_delta = delta; + let same_strip = prev_tile.loc().same_strip(&tile.loc()); + if same_strip { + fp |= 8; + } + let x0 = fp.trailing_zeros(); + let x1 = 32 - fp.leading_zeros(); + let mut areas = [[start_delta as f32; 4]; 4]; + for this_tile in &tiles[seg_start..i] { + // small gain possible here to unpack in simd, but llvm goes halfway + delta += this_tile.delta(); + let p0 = Vec2::unpack(this_tile.p0); + let p1 = Vec2::unpack(this_tile.p1); + let slope = (p1.x - p0.x) / (p1.y - p0.y); + let vstarty = vsubq_f32(vdupq_n_f32(p0.y), iota); + let vy0 = vminq_f32(vmaxq_f32(vstarty, vdupq_n_f32(0.0)), vdupq_n_f32(1.0)); + let vy1a = vsubq_f32(vdupq_n_f32(p1.y), iota); + let vy1 = vminq_f32(vmaxq_f32(vy1a, vdupq_n_f32(0.0)), vdupq_n_f32(1.0)); + let vdy = vsubq_f32(vy0, vy1); + let mask = vceqzq_f32(vdy); + let vslope = vbslq_f32(mask, vdupq_n_f32(0.0), vdupq_n_f32(slope)); + let vdy0 = vsubq_f32(vy0, vstarty); + let vdy1 = vsubq_f32(vy1, vstarty); + let mut vyedge = vdupq_n_f32(0.0); + if p0.x == 0.0 { + let ye = vsubq_f32(vdupq_n_f32(1.0), vstarty); + vyedge = vminq_f32(vmaxq_f32(ye, vdupq_n_f32(0.0)), vdupq_n_f32(1.0)); + } else if p1.x == 0.0 { + let ye = vsubq_f32(vy1a, vdupq_n_f32(1.0)); + vyedge = vminq_f32(vmaxq_f32(ye, vdupq_n_f32(-1.0)), vdupq_n_f32(0.0)); + } + for x in x0..x1 { + let mut varea = vld1q_f32(areas.as_ptr().add(x as usize) as *const f32); + varea = vaddq_f32(varea, vyedge); + let vstartx = vdupq_n_f32(p0.x - x as f32); + let vxx0 = vfmaq_f32(vstartx, vdy0, vslope); + let vxx1 = vfmaq_f32(vstartx, vdy1, vslope); + let vxmin0 = vminq_f32(vxx0, vxx1); + let vxmax = vmaxq_f32(vxx0, vxx1); + let vxmin = + vsubq_f32(vminq_f32(vxmin0, vdupq_n_f32(1.0)), vdupq_n_f32(1e-6)); + let vb = vminq_f32(vxmax, vdupq_n_f32(1.0)); + let vc = vmaxq_f32(vb, vdupq_n_f32(0.0)); + let vd = vmaxq_f32(vxmin, vdupq_n_f32(0.0)); + let vd2 = vmulq_f32(vd, vd); + let vd2c2 = vfmsq_f32(vd2, vc, vc); + let vax = vfmaq_f32(vb, vd2c2, vdupq_n_f32(0.5)); + let va = vdivq_f32(vsubq_f32(vax, vxmin), vsubq_f32(vxmax, vxmin)); + varea = vfmaq_f32(varea, va, vdy); + vst1q_f32(areas.as_mut_ptr().add(x as usize) as *mut f32, varea); + } + } + for x in x0..x1 { + let mut alphas = 0_u32; + let varea = vld1q_f32(areas.as_ptr().add(x as usize) as *const f32); + let vnzw = vminq_f32(vabsq_f32(varea), vdupq_n_f32(1.0)); + let vscaled = vmulq_f32(vnzw, vdupq_n_f32(255.0)); + let vbits = vreinterpretq_u8_u32(vcvtnq_u32_f32(vscaled)); + let vbits2 = vuzp1q_u8(vbits, vbits); + let vbits3 = vreinterpretq_u32_u8(vuzp1q_u8(vbits2, vbits2)); + vst1q_lane_u32::<0>(&mut alphas, vbits3); + alpha_buf.push(alphas); + } + + if strip_start { + let xy = (1 << 18) * prev_tile.y as u32 + 4 * prev_tile.x as u32 + x0; + let strip = Strip { + xy, + col: cols, + winding: start_delta, + }; + strip_buf.push(strip); + } + cols += x1 - x0; + fp = if same_strip { 1 } else { 0 }; + strip_start = !same_strip; + seg_start = i; + if !prev_tile.loc().same_row(&tile.loc()) { + delta = 0; + } + } + fp |= tile.footprint().0; + prev_tile = tile; + } + } +} diff --git a/vello_hybrid/src/strip.rs b/vello_hybrid/src/strip.rs new file mode 100644 index 00000000..9f3b1e1e --- /dev/null +++ b/vello_hybrid/src/strip.rs @@ -0,0 +1,209 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! CPU implementation of sparse strip rendering +//! +//! This is copied from the most recent GPU implementation, but has +//! `path_id` stripped out, as on CPU we'll be doing one path at a time. +//! That decision makes sense to some extent even when uploading to +//! GPU, though some mechanism is required to tie the strips to paint. +//! +//! If there becomes a single, unified code base for this, then the +//! `path_id` type should probably become a generic parameter. + +use crate::{tiling::Vec2, wide_tile::STRIP_HEIGHT}; + +#[derive(Clone, Copy, PartialEq, Eq)] +pub(crate) struct Loc { + x: u16, + y: u16, +} + +pub(crate) struct Footprint(pub(crate) u32); + +pub(crate) struct Tile { + pub x: u16, + pub y: u16, + pub p0: u32, // packed + pub p1: u32, // packed +} + +impl std::fmt::Debug for Tile { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let p0 = Vec2::unpack(self.p0); + let p1 = Vec2::unpack(self.p1); + write!( + f, + "Tile {{ xy: ({}, {}), p0: ({:.4}, {:.4}), p1: ({:.4}, {:.4}) }}", + self.x, self.y, p0.x, p0.y, p1.x, p1.y + ) + } +} + +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub(crate) struct Strip { + pub xy: u32, // this could be u16's on the Rust side + pub col: u32, + pub winding: i32, +} + +impl Loc { + pub(crate) fn same_strip(&self, other: &Self) -> bool { + self.same_row(other) && (other.x - self.x) / 2 == 0 + } + + pub(crate) fn same_row(&self, other: &Self) -> bool { + self.y == other.y + } +} + +impl Tile { + #[allow(unused, reason = "only used for synthetic data")] + /// Create a tile from synthetic data. + fn new(loc: Loc, footprint: Footprint, delta: i32) -> Self { + let p0 = (delta == -1) as u32 * 65536 + footprint.0.trailing_zeros() * 8192; + let p1 = (delta == 1) as u32 * 65536 + (32 - footprint.0.leading_zeros()) * 8192; + Self { + x: loc.x, + y: loc.y, + p0, + p1, + } + } + + pub(crate) fn loc(&self) -> Loc { + Loc { + x: self.x, + y: self.y, + } + } + + pub(crate) fn footprint(&self) -> Footprint { + let x0 = (self.p0 & 0xffff) as f32 * (1.0 / 8192.0); + let x1 = (self.p1 & 0xffff) as f32 * (1.0 / 8192.0); + // On CPU, might be better to do this as fixed point + let xmin = x0.min(x1).floor() as u32; + let xmax = (xmin + 1).max(x0.max(x1).ceil() as u32); + Footprint((1 << xmax) - (1 << xmin)) + } + + pub(crate) fn delta(&self) -> i32 { + ((self.p1 >> 16) == 0) as i32 - ((self.p0 >> 16) == 0) as i32 + } + + // Comparison function for sorting. Only compares loc, doesn't care + // about points. Unpacking code has been validated to be efficient in + // Godbolt. + pub(crate) fn cmp(&self, b: &Self) -> std::cmp::Ordering { + let xya = ((self.y as u32) << 16) + (self.x as u32); + let xyb = ((b.y as u32) << 16) + (b.x as u32); + xya.cmp(&xyb) + } +} + +// This can be unused when SIMD is selected. Probably a good idea to make it +// selectable at runtime; will be needed for AVX2. +#[allow(unused, reason = "may be unused when SIMD is selected")] +pub(crate) fn render_strips_scalar( + tiles: &[Tile], + strip_buf: &mut Vec, + alpha_buf: &mut Vec, +) { + strip_buf.clear(); + let mut strip_start = true; + let mut cols = alpha_buf.len() as u32; + let mut prev_tile = &tiles[0]; + let mut fp = prev_tile.footprint().0; + let mut seg_start = 0; + let mut delta = 0; + // Note: the input should contain a sentinel tile, to avoid having + // logic here to process the final strip. + for i in 1..tiles.len() { + let tile = &tiles[i]; + if prev_tile.loc() != tile.loc() { + let start_delta = delta; + let same_strip = prev_tile.loc().same_strip(&tile.loc()); + if same_strip { + fp |= 8; + } + let x0 = fp.trailing_zeros(); + let x1 = 32 - fp.leading_zeros(); + let mut areas = [[start_delta as f32; 4]; 4]; + for this_tile in &tiles[seg_start..i] { + delta += this_tile.delta(); + let p0 = Vec2::unpack(this_tile.p0); + let p1 = Vec2::unpack(this_tile.p1); + let slope = (p1.x - p0.x) / (p1.y - p0.y); + for x in x0..x1 { + let startx = p0.x - x as f32; + for y in 0..4 { + let starty = p0.y - y as f32; + let y0 = starty.clamp(0.0, 1.0); + let y1 = (p1.y - y as f32).clamp(0.0, 1.0); + let dy = y0 - y1; + // Note: getting rid of this predicate might help with + // auto-vectorization. That said, just getting rid of + // it causes artifacts (which may be divide by zero). + if dy != 0.0 { + let xx0 = startx + (y0 - starty) * slope; + let xx1 = startx + (y1 - starty) * slope; + let xmin0 = xx0.min(xx1); + let xmax = xx0.max(xx1); + let xmin = xmin0.min(1.0) - 1e-6; + let b = xmax.min(1.0); + let c = b.max(0.0); + let d = xmin.max(0.0); + let a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin); + areas[x as usize][y] += a * dy; + } + if p0.x == 0.0 { + areas[x as usize][y] += (y as f32 - p0.y + 1.0).clamp(0.0, 1.0); + } else if p1.x == 0.0 { + areas[x as usize][y] -= (y as f32 - p1.y + 1.0).clamp(0.0, 1.0); + } + } + } + } + for x in x0..x1 { + let mut alphas = 0_u32; + for y in 0..4 { + let area = areas[x as usize][y]; + // nonzero winding number rule + let area_u8 = (area.abs().min(1.0) * 255.0).round() as u32; + alphas += area_u8 << (y * 8); + } + alpha_buf.push(alphas); + } + + if strip_start { + let xy = (1 << 18) * prev_tile.y as u32 + 4 * prev_tile.x as u32 + x0; + let strip = Strip { + xy, + col: cols, + winding: start_delta, + }; + strip_buf.push(strip); + } + cols += x1 - x0; + fp = if same_strip { 1 } else { 0 }; + strip_start = !same_strip; + seg_start = i; + if !prev_tile.loc().same_row(&tile.loc()) { + delta = 0; + } + } + fp |= tile.footprint().0; + prev_tile = tile; + } +} + +impl Strip { + pub(crate) fn x(&self) -> u32 { + self.xy & 0xffff + } + + pub(crate) fn strip_y(&self) -> u32 { + self.xy / ((1 << 16) * STRIP_HEIGHT as u32) + } +} diff --git a/vello_hybrid/src/tiling.rs b/vello_hybrid/src/tiling.rs new file mode 100644 index 00000000..dd6acdb2 --- /dev/null +++ b/vello_hybrid/src/tiling.rs @@ -0,0 +1,305 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use crate::strip::Tile; + +const TILE_WIDTH: u32 = 4; +const TILE_HEIGHT: u32 = 4; + +const TILE_SCALE_X: f32 = 1.0 / TILE_WIDTH as f32; +const TILE_SCALE_Y: f32 = 1.0 / TILE_HEIGHT as f32; + +/// This is just Line but f32 +#[derive(Clone, Copy, Debug)] +#[repr(C)] +pub struct FlatLine { + // should these be vec2? + pub p0: [f32; 2], + pub p1: [f32; 2], +} + +impl FlatLine { + pub fn new(p0: [f32; 2], p1: [f32; 2]) -> Self { + Self { p0, p1 } + } +} + +#[derive(Clone, Copy, Debug)] +pub(crate) struct Vec2 { + pub x: f32, + pub y: f32, +} + +const TILE_SCALE: f32 = 8192.0; +// scale factor relative to unit square in tile +const FRAC_TILE_SCALE: f32 = 8192.0 * 4.0; + +fn scale_up(z: f32) -> u32 { + (z * FRAC_TILE_SCALE).round() as u32 +} + +impl Vec2 { + fn new(x: f32, y: f32) -> Self { + Self { x, y } + } + + fn from_array(xy: [f32; 2]) -> Self { + Self::new(xy[0], xy[1]) + } + + #[allow(unused, reason = "code might pack by hand")] + // Note: this assumes values in range. + fn pack(self) -> u32 { + // TODO: scale should depend on tile size + let x = (self.x * TILE_SCALE).round() as u32; + let y = (self.y * TILE_SCALE).round() as u32; + (y << 16) + x + } + + pub(crate) fn unpack(packed: u32) -> Self { + let x = (packed & 0xffff) as f32 * (1.0 / TILE_SCALE); + let y = (packed >> 16) as f32 * (1.0 / TILE_SCALE); + Self::new(x, y) + } +} + +impl std::ops::Add for Vec2 { + type Output = Self; + + fn add(self, rhs: Self) -> Self { + Self::new(self.x + rhs.x, self.y + rhs.y) + } +} + +impl std::ops::Sub for Vec2 { + type Output = Self; + + fn sub(self, rhs: Self) -> Self { + Self::new(self.x - rhs.x, self.y - rhs.y) + } +} + +impl std::ops::Mul for Vec2 { + type Output = Self; + + fn mul(self, rhs: f32) -> Self { + Self::new(self.x * rhs, self.y * rhs) + } +} + +fn span(a: f32, b: f32) -> u32 { + (a.max(b).ceil() - a.min(b).floor()).max(1.0) as u32 +} + +pub(crate) fn make_tiles(lines: &[FlatLine], tile_buf: &mut Vec) { + tile_buf.clear(); + for line in lines { + let p0 = Vec2::from_array(line.p0); + let p1 = Vec2::from_array(line.p1); + let s0 = p0 * TILE_SCALE_X; + let s1 = p1 * TILE_SCALE_Y; + let count_x = span(s0.x, s1.x); + let count_y = span(s0.y, s1.y); + let mut x = s0.x.floor(); + if s0.x == x && s1.x < x { + // s0.x is on right side of first tile + x -= 1.0; + } + let mut y = s0.y.floor(); + if s0.y == y && s1.y < y { + // s0.y is on bottom of first tile + y -= 1.0; + } + let xfrac0 = scale_up(s0.x - x); + let yfrac0 = scale_up(s0.y - y); + let packed0 = (yfrac0 << 16) + xfrac0; + // These could be replaced with <2 and the max(1.0) in span removed + if count_x == 1 { + let xfrac1 = scale_up(s1.x - x); + if count_y == 1 { + let yfrac1 = scale_up(s1.y - y); + let packed1 = (yfrac1 << 16) + xfrac1; + // 1x1 tile + tile_buf.push(Tile { + x: x as u16, + y: y as u16, + p0: packed0, + p1: packed1, + }); + } else { + // vertical column + let slope = (s1.x - s0.x) / (s1.y - s0.y); + let sign = (s1.y - s0.y).signum(); + let mut xclip0 = (s0.x - x) + (y - s0.y) * slope; + let yclip = if sign > 0.0 { + xclip0 += slope; + scale_up(1.0) + } else { + 0 + }; + let mut last_packed = packed0; + for i in 0..count_y - 1 { + let xclip = xclip0 + i as f32 * sign * slope; + let xfrac = scale_up(xclip).max(1); + let packed = (yclip << 16) + xfrac; + tile_buf.push(Tile { + x: x as u16, + y: (y + i as f32 * sign) as u16, + p0: last_packed, + p1: packed, + }); + // flip y between top and bottom of tile + last_packed = packed ^ ((FRAC_TILE_SCALE as u32) << 16); + } + let yfrac1 = scale_up(s1.y - (y + (count_y - 1) as f32 * sign)); + let packed1 = (yfrac1 << 16) + xfrac1; + + tile_buf.push(Tile { + x: x as u16, + y: (y + (count_y - 1) as f32 * sign) as u16, + p0: last_packed, + p1: packed1, + }); + } + } else if count_y == 1 { + // horizontal row + let slope = (s1.y - s0.y) / (s1.x - s0.x); + let sign = (s1.x - s0.x).signum(); + let mut yclip0 = (s0.y - y) + (x - s0.x) * slope; + let xclip = if sign > 0.0 { + yclip0 += slope; + scale_up(1.0) + } else { + 0 + }; + let mut last_packed = packed0; + for i in 0..count_x - 1 { + let yclip = yclip0 + i as f32 * sign * slope; + let yfrac = scale_up(yclip).max(1); + let packed = (yfrac << 16) + xclip; + tile_buf.push(Tile { + x: (x + i as f32 * sign) as u16, + y: y as u16, + p0: last_packed, + p1: packed, + }); + // flip x between left and right of tile + last_packed = packed ^ (FRAC_TILE_SCALE as u32); + } + let xfrac1 = scale_up(s1.x - (x + (count_x - 1) as f32 * sign)); + let yfrac1 = scale_up(s1.y - y); + let packed1 = (yfrac1 << 16) + xfrac1; + + tile_buf.push(Tile { + x: (x + (count_x - 1) as f32 * sign) as u16, + y: y as u16, + p0: last_packed, + p1: packed1, + }); + } else { + // general case + let recip_dx = 1.0 / (s1.x - s0.x); + let signx = (s1.x - s0.x).signum(); + let recip_dy = 1.0 / (s1.y - s0.y); + let signy = (s1.y - s0.y).signum(); + // t parameter for next intersection with a vertical grid line + let mut t_clipx = (x - s0.x) * recip_dx; + let xclip = if signx > 0.0 { + t_clipx += recip_dx; + scale_up(1.0) + } else { + 0 + }; + // t parameter for next intersection with a horizontal grid line + let mut t_clipy = (y - s0.y) * recip_dy; + let yclip = if signy > 0.0 { + t_clipy += recip_dy; + scale_up(1.0) + } else { + 0 + }; + let x1 = x + (count_x - 1) as f32 * signx; + let y1 = y + (count_y - 1) as f32 * signy; + let mut xi = x; + let mut yi = y; + let mut last_packed = packed0; + let mut count = 0; + while xi != x1 || yi != y1 { + count += 1; + if count == 400 { + panic!(); + } + if t_clipy < t_clipx { + // intersected with horizontal grid line + let x_intersect = s0.x + (s1.x - s0.x) * t_clipy - xi; + let xfrac = scale_up(x_intersect).max(1); // maybe should clamp? + let packed = (yclip << 16) + xfrac; + tile_buf.push(Tile { + x: xi as u16, + y: yi as u16, + p0: last_packed, + p1: packed, + }); + t_clipy += recip_dy.abs(); + yi += signy; + last_packed = packed ^ ((FRAC_TILE_SCALE as u32) << 16); + } else { + // intersected with vertical grid line + let y_intersect = s0.y + (s1.y - s0.y) * t_clipx - yi; + let yfrac = scale_up(y_intersect).max(1); // maybe should clamp? + let packed = (yfrac << 16) + xclip; + tile_buf.push(Tile { + x: xi as u16, + y: yi as u16, + p0: last_packed, + p1: packed, + }); + t_clipx += recip_dx.abs(); + xi += signx; + last_packed = packed ^ (FRAC_TILE_SCALE as u32); + } + } + let xfrac1 = scale_up(s1.x - xi); + let yfrac1 = scale_up(s1.y - yi); + let packed1 = (yfrac1 << 16) + xfrac1; + + tile_buf.push(Tile { + x: xi as u16, + y: yi as u16, + p0: last_packed, + p1: packed1, + }); + } + } + // This particular choice of sentinel tiles generates a sentinel strip. + tile_buf.push(Tile { + x: 0x3ffd, + y: 0x3fff, + p0: 0, + p1: 0, + }); + tile_buf.push(Tile { + x: 0x3fff, + y: 0x3fff, + p0: 0, + p1: 0, + }); +} + +#[test] +fn tiling() { + let l = FlatLine { + p0: [1.3, 1.4], + p1: [20.1, 50.2], + }; + let mut tiles = vec![]; + make_tiles(&[l], &mut tiles); + for tile in &tiles { + let p0 = Vec2::unpack(tile.p0); + let p1 = Vec2::unpack(tile.p1); + println!( + "@{}, {}: ({}, {}) - ({}, {})", + tile.x, tile.y, p0.x, p0.y, p1.x, p1.y + ); + } +} diff --git a/vello_hybrid/src/wide_tile.rs b/vello_hybrid/src/wide_tile.rs new file mode 100644 index 00000000..fe5296a9 --- /dev/null +++ b/vello_hybrid/src/wide_tile.rs @@ -0,0 +1,57 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +use vello_api::peniko::color::{AlphaColor, Srgb}; + +pub(crate) const WIDE_TILE_WIDTH: usize = 256; +pub(crate) const STRIP_HEIGHT: usize = 4; + +pub(crate) struct WideTile { + pub(crate) bg: AlphaColor, + pub(crate) cmds: Vec, +} + +#[derive(Debug)] +pub(crate) enum Cmd { + Fill(CmdFill), + Strip(CmdStrip), +} + +#[derive(Debug)] +pub(crate) struct CmdFill { + pub(crate) x: u32, + pub(crate) width: u32, + pub(crate) color: AlphaColor, +} + +#[derive(Debug)] +pub(crate) struct CmdStrip { + pub(crate) x: u32, + pub(crate) width: u32, + pub(crate) alpha_ix: usize, + pub(crate) color: AlphaColor, +} + +impl Default for WideTile { + fn default() -> Self { + Self { + bg: AlphaColor::TRANSPARENT, + cmds: vec![], + } + } +} + +impl WideTile { + pub(crate) fn fill(&mut self, x: u32, width: u32, color: AlphaColor) { + if x == 0 && width == WIDE_TILE_WIDTH as u32 && color.components[3] == 1.0 { + self.cmds.clear(); + self.bg = color; + } else { + self.cmds.push(Cmd::Fill(CmdFill { x, width, color })); + } + } + + pub(crate) fn push(&mut self, cmd: Cmd) { + self.cmds.push(cmd); + } +} From eb791309b892958a8bfe5db0e31fc0ffb490291b Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Mon, 17 Feb 2025 20:16:09 -0800 Subject: [PATCH 2/5] Fix lints in non-aarch64 cfg's --- vello_hybrid/src/simd.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vello_hybrid/src/simd.rs b/vello_hybrid/src/simd.rs index c2a2614e..2213b558 100644 --- a/vello_hybrid/src/simd.rs +++ b/vello_hybrid/src/simd.rs @@ -59,13 +59,13 @@ pub(crate) fn render_strips(tiles: &[Tile], strip_buf: &mut Vec, alpha_bu } #[cfg(not(target_arch = "aarch64"))] -pub fn render_strips(tiles: &[Tile], strip_buf: &mut Vec, alpha_buf: &mut Vec) { +pub(crate) fn render_strips(tiles: &[Tile], strip_buf: &mut Vec, alpha_buf: &mut Vec) { crate::strip::render_strips_scalar(tiles, strip_buf, alpha_buf); } // This block is the fallback, no SIMD #[cfg(not(target_arch = "aarch64"))] -impl<'a> Fine<'a> { +impl Fine<'_> { pub(crate) fn pack(&mut self, x: usize, y: usize) { self.pack_scalar(x, y); } From 23a973d14b0b88a74192a0d0a54ccf45327c2506 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Tue, 18 Feb 2025 18:06:21 -0800 Subject: [PATCH 3/5] Start wiring up GPU render pipeline Renders a simple scene to the GPU, first by doing coarse rasterization the same as cpu-sparse, then doing a single draw call. --- Cargo.lock | 471 ++++++++++++++++++++++++++------ vello_api/Cargo.toml | 2 +- vello_hybrid/Cargo.toml | 9 + vello_hybrid/examples/gpu.rs | 176 ++++++++++++ vello_hybrid/shader/render.wgsl | 74 +++++ vello_hybrid/src/lib.rs | 5 + vello_hybrid/src/render.rs | 8 +- vello_hybrid/src/wide_tile.rs | 1 + 8 files changed, 657 insertions(+), 89 deletions(-) create mode 100644 vello_hybrid/examples/gpu.rs create mode 100644 vello_hybrid/shader/render.wgsl diff --git a/Cargo.lock b/Cargo.lock index e1b746b7..61299d27 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -46,6 +46,27 @@ dependencies = [ "memchr", ] +[[package]] +name = "android-activity" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee91c0c2905bae44f84bfa4e044536541df26b7703fd0888deeb9060fcc44289" +dependencies = [ + "android-properties", + "bitflags 2.8.0", + "cc", + "cesu8", + "jni", + "jni-sys", + "libc", + "log", + "ndk 0.8.0", + "ndk-context", + "ndk-sys 0.5.0+25.2.9519653", + "num_enum", + "thiserror 1.0.69", +] + [[package]] name = "android-activity" version = "0.6.0" @@ -60,7 +81,7 @@ dependencies = [ "jni-sys", "libc", "log", - "ndk", + "ndk 0.9.0", "ndk-context", "ndk-sys 0.6.0+11769913", "num_enum", @@ -191,6 +212,25 @@ dependencies = [ "libloading", ] +[[package]] +name = "async-executor" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30ca9a001c1e8ba5149f91a74362376cc6bc5b919d92d988668657bd570bdcec" +dependencies = [ + "async-task", + "concurrent-queue", + "fastrand", + "futures-lite", + "slab", +] + +[[package]] +name = "async-task" +version = "4.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de" + [[package]] name = "atomic-waker" version = "1.1.2" @@ -245,13 +285,32 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a" +[[package]] +name = "block-sys" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae85a0696e7ea3b835a453750bf002770776609115e6d25c6d2ff28a8200f7e7" +dependencies = [ + "objc-sys", +] + +[[package]] +name = "block2" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15b55663a85f33501257357e6421bb33e769d5c9ffb5ba0921c975a123e35e68" +dependencies = [ + "block-sys", + "objc2 0.4.1", +] + [[package]] name = "block2" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2c132eebf10f5cad5289222520a4a058514204aed6d791f1cf4fe8088b82d15f" dependencies = [ - "objc2", + "objc2 0.5.2", ] [[package]] @@ -298,6 +357,20 @@ version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" +[[package]] +name = "calloop" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fba7adb4dd5aa98e5553510223000e7148f621165ec5f9acd7113f6ca4995298" +dependencies = [ + "bitflags 2.8.0", + "log", + "polling", + "rustix", + "slab", + "thiserror 1.0.69", +] + [[package]] name = "calloop" version = "0.13.0" @@ -312,13 +385,25 @@ dependencies = [ "thiserror 1.0.69", ] +[[package]] +name = "calloop-wayland-source" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f0ea9b9476c7fad82841a8dbb380e2eae480c21910feba80725b46931ed8f02" +dependencies = [ + "calloop 0.12.4", + "rustix", + "wayland-backend", + "wayland-client", +] + [[package]] name = "calloop-wayland-source" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "95a66a987056935f7efce4ab5668920b5d0dac4a7c99991a67395f13702ddd20" dependencies = [ - "calloop", + "calloop 0.13.0", "rustix", "wayland-backend", "wayland-client", @@ -359,6 +444,12 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "cfg_aliases" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e" + [[package]] name = "cfg_aliases" version = "0.2.1" @@ -424,14 +515,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "color" -version = "0.1.0" -source = "git+https://github.com/linebender/color.git?rev=a4fa61aff6c3f292b729dc409e7832e5f0166e4a#a4fa61aff6c3f292b729dc409e7832e5f0166e4a" -dependencies = [ - "serde", -] - [[package]] name = "color" version = "0.2.3" @@ -796,6 +879,25 @@ dependencies = [ "parking_lot", ] +[[package]] +name = "futures-io" +version = "0.3.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6" + +[[package]] +name = "futures-lite" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f5edaec856126859abb19ed65f39e90fea3a9574b9707f13539acf4abf7eb532" +dependencies = [ + "fastrand", + "futures-core", + "futures-io", + "parking", + "pin-project-lite", +] + [[package]] name = "gethostname" version = "0.4.3" @@ -969,7 +1071,7 @@ dependencies = [ "env_logger", "futures-intrusive", "png", - "pollster", + "pollster 0.4.0", "scenes", "vello", ] @@ -998,6 +1100,17 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +[[package]] +name = "icrate" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d3aaff8a54577104bafdf686ff18565c3b6903ca5782a2026ef06e2c7aa319" +dependencies = [ + "block2 0.3.0", + "dispatch", + "objc2 0.4.1", +] + [[package]] name = "id-arena" version = "2.2.1" @@ -1300,7 +1413,7 @@ dependencies = [ "arrayvec", "bit-set", "bitflags 2.8.0", - "cfg_aliases", + "cfg_aliases 0.2.1", "codespan-reporting", "hexf-parse", "indexmap 2.7.0", @@ -1313,6 +1426,21 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "ndk" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2076a31b7010b17a38c01907c45b945e8f11495ee4dd588309718901b1f7a5b7" +dependencies = [ + "bitflags 2.8.0", + "jni-sys", + "log", + "ndk-sys 0.5.0+25.2.9519653", + "num_enum", + "raw-window-handle", + "thiserror 1.0.69", +] + [[package]] name = "ndk" version = "0.9.0" @@ -1453,6 +1581,16 @@ version = "0.3.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdb91bdd390c7ce1a8607f35f3ca7151b65afc0ff5ff3b34fa350f7d7c7e4310" +[[package]] +name = "objc2" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "559c5a40fdd30eb5e344fbceacf7595a81e242529fb4e21cf5f43fb4f11ff98d" +dependencies = [ + "objc-sys", + "objc2-encode 3.0.0", +] + [[package]] name = "objc2" version = "0.5.2" @@ -1460,7 +1598,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46a785d4eeff09c14c487497c162e92766fbb3e4059a71840cecc03d9a50b804" dependencies = [ "objc-sys", - "objc2-encode", + "objc2-encode 4.0.3", ] [[package]] @@ -1470,9 +1608,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e4e89ad9e3d7d297152b17d39ed92cd50ca8063a89a9fa569046d41568891eff" dependencies = [ "bitflags 2.8.0", - "block2", + "block2 0.5.1", "libc", - "objc2", + "objc2 0.5.2", "objc2-core-data", "objc2-core-image", "objc2-foundation", @@ -1486,8 +1624,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "74dd3b56391c7a0596a295029734d3c1c5e7e510a4cb30245f8221ccea96b009" dependencies = [ "bitflags 2.8.0", - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-core-location", "objc2-foundation", ] @@ -1498,8 +1636,8 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a5ff520e9c33812fd374d8deecef01d4a840e7b41862d849513de77e44aa4889" dependencies = [ - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-foundation", ] @@ -1510,8 +1648,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "617fbf49e071c178c0b24c080767db52958f716d9eabdf0890523aeae54773ef" dependencies = [ "bitflags 2.8.0", - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-foundation", ] @@ -1521,8 +1659,8 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "55260963a527c99f1819c4f8e3b47fe04f9650694ef348ffd2227e8196d34c80" dependencies = [ - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-foundation", "objc2-metal", ] @@ -1533,12 +1671,18 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "000cfee34e683244f284252ee206a27953279d370e309649dc3ee317b37e5781" dependencies = [ - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-contacts", "objc2-foundation", ] +[[package]] +name = "objc2-encode" +version = "3.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d079845b37af429bfe5dfa76e6d087d788031045b25cfc6fd898486fd9847666" + [[package]] name = "objc2-encode" version = "4.0.3" @@ -1552,10 +1696,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ee638a5da3799329310ad4cfa62fbf045d5f56e3ef5ba4149e7452dcf89d5a8" dependencies = [ "bitflags 2.8.0", - "block2", + "block2 0.5.1", "dispatch", "libc", - "objc2", + "objc2 0.5.2", ] [[package]] @@ -1564,8 +1708,8 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a1a1ae721c5e35be65f01a03b6d2ac13a54cb4fa70d8a5da293d7b0020261398" dependencies = [ - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-app-kit", "objc2-foundation", ] @@ -1577,8 +1721,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd0cba1276f6023976a406a14ffa85e1fdd19df6b0f737b063b95f6c8c7aadd6" dependencies = [ "bitflags 2.8.0", - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-foundation", ] @@ -1589,8 +1733,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e42bee7bff906b14b167da2bac5efe6b6a07e6f7c0a21a7308d40c960242dc7a" dependencies = [ "bitflags 2.8.0", - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-foundation", "objc2-metal", ] @@ -1601,7 +1745,7 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0a684efe3dec1b305badae1a28f6555f6ddd3bb2c2267896782858d5a78404dc" dependencies = [ - "objc2", + "objc2 0.5.2", "objc2-foundation", ] @@ -1612,8 +1756,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b8bb46798b20cd6b91cbd113524c490f1686f4c4e8f49502431415f3512e2b6f" dependencies = [ "bitflags 2.8.0", - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-cloud-kit", "objc2-core-data", "objc2-core-image", @@ -1632,8 +1776,8 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "44fa5f9748dbfe1ca6c0b79ad20725a11eca7c2218bceb4b005cb1be26273bfe" dependencies = [ - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-foundation", ] @@ -1644,8 +1788,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76cfcbf642358e8689af64cee815d139339f3ed8ad05103ed5eaf73db8d84cb3" dependencies = [ "bitflags 2.8.0", - "block2", - "objc2", + "block2 0.5.1", + "objc2 0.5.2", "objc2-core-location", "objc2-foundation", ] @@ -1683,6 +1827,12 @@ dependencies = [ "ttf-parser", ] +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + [[package]] name = "parking_lot" version = "0.12.3" @@ -1712,23 +1862,13 @@ version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" -[[package]] -name = "peniko" -version = "0.2.0" -source = "git+https://github.com/linebender/peniko?rev=aeded39#aeded39d0ea2fccf7db598c83b83748f3b88195f" -dependencies = [ - "color 0.1.0", - "kurbo", - "smallvec", -] - [[package]] name = "peniko" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2ec061f435b573ff602b2c5690f1a5bfcf461327194654cfbfce4412b95cf2a1" dependencies = [ - "color 0.2.3", + "color", "kurbo", "smallvec", ] @@ -1805,6 +1945,12 @@ dependencies = [ "windows-sys 0.59.0", ] +[[package]] +name = "pollster" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22686f4785f02a4fcc856d3b3bb19bf6c8160d103f7a99cc258bddd0251dc7f2" + [[package]] name = "pollster" version = "0.4.0" @@ -1955,6 +2101,15 @@ dependencies = [ "font-types", ] +[[package]] +name = "redox_syscall" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29" +dependencies = [ + "bitflags 1.3.2", +] + [[package]] name = "redox_syscall" version = "0.4.1" @@ -2086,7 +2241,7 @@ dependencies = [ "roxmltree 0.20.0", "skrifa", "vello", - "web-time", + "web-time 1.1.0", ] [[package]] @@ -2126,6 +2281,19 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "sctk-adwaita" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70b31447ca297092c5a9916fc3b955203157b37c19ca8edde4f52e9843e602c7" +dependencies = [ + "ab_glyph", + "log", + "memmap2", + "smithay-client-toolkit 0.18.1", + "tiny-skia", +] + [[package]] name = "sctk-adwaita" version = "0.10.1" @@ -2135,7 +2303,7 @@ dependencies = [ "ab_glyph", "log", "memmap2", - "smithay-client-toolkit", + "smithay-client-toolkit 0.19.2", "tiny-skia", ] @@ -2239,16 +2407,16 @@ name = "simple" version = "0.0.0" dependencies = [ "anyhow", - "pollster", + "pollster 0.4.0", "vello", - "winit", + "winit 0.30.8", ] [[package]] name = "simple_sdl2" version = "0.0.0" dependencies = [ - "pollster", + "pollster 0.4.0", "sdl2", "vello", ] @@ -2287,6 +2455,31 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" +[[package]] +name = "smithay-client-toolkit" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "922fd3eeab3bd820d76537ce8f582b1cf951eceb5475c28500c7457d9d17f53a" +dependencies = [ + "bitflags 2.8.0", + "calloop 0.12.4", + "calloop-wayland-source 0.2.0", + "cursor-icon", + "libc", + "log", + "memmap2", + "rustix", + "thiserror 1.0.69", + "wayland-backend", + "wayland-client", + "wayland-csd-frame", + "wayland-cursor", + "wayland-protocols 0.31.2", + "wayland-protocols-wlr 0.2.0", + "wayland-scanner", + "xkeysym", +] + [[package]] name = "smithay-client-toolkit" version = "0.19.2" @@ -2294,8 +2487,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3457dea1f0eb631b4034d61d4d8c32074caa6cd1ab2d59f2327bd8461e2c0016" dependencies = [ "bitflags 2.8.0", - "calloop", - "calloop-wayland-source", + "calloop 0.13.0", + "calloop-wayland-source 0.3.0", "cursor-icon", "libc", "log", @@ -2306,8 +2499,8 @@ dependencies = [ "wayland-client", "wayland-csd-frame", "wayland-cursor", - "wayland-protocols", - "wayland-protocols-wlr", + "wayland-protocols 0.32.5", + "wayland-protocols-wlr 0.3.5", "wayland-scanner", "xkeysym", ] @@ -2616,7 +2809,7 @@ dependencies = [ "bytemuck", "futures-intrusive", "log", - "peniko 0.3.1", + "peniko", "png", "skrifa", "static_assertions", @@ -2631,7 +2824,7 @@ dependencies = [ name = "vello_api" version = "0.1.0" dependencies = [ - "peniko 0.2.0", + "peniko", ] [[package]] @@ -2640,7 +2833,7 @@ version = "0.4.0" dependencies = [ "bytemuck", "guillotiere", - "peniko 0.3.1", + "peniko", "skrifa", "smallvec", ] @@ -2649,10 +2842,15 @@ dependencies = [ name = "vello_hybrid" version = "0.1.0" dependencies = [ + "async-executor", + "bytemuck", "flatten", "png", + "pollster 0.3.0", "roxmltree 0.20.0", "vello_api", + "wgpu", + "winit 0.29.15", ] [[package]] @@ -2674,7 +2872,7 @@ dependencies = [ "image", "nv-flip", "png", - "pollster", + "pollster 0.4.0", "scenes", "vello", ] @@ -2966,6 +3164,18 @@ dependencies = [ "xcursor", ] +[[package]] +name = "wayland-protocols" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f81f365b8b4a97f422ac0e8737c438024b5951734506b0e1d775c73030561f4" +dependencies = [ + "bitflags 2.8.0", + "wayland-backend", + "wayland-client", + "wayland-scanner", +] + [[package]] name = "wayland-protocols" version = "0.32.5" @@ -2978,6 +3188,19 @@ dependencies = [ "wayland-scanner", ] +[[package]] +name = "wayland-protocols-plasma" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23803551115ff9ea9bce586860c5c5a971e360825a0309264102a9495a5ff479" +dependencies = [ + "bitflags 2.8.0", + "wayland-backend", + "wayland-client", + "wayland-protocols 0.31.2", + "wayland-scanner", +] + [[package]] name = "wayland-protocols-plasma" version = "0.3.5" @@ -2987,7 +3210,20 @@ dependencies = [ "bitflags 2.8.0", "wayland-backend", "wayland-client", - "wayland-protocols", + "wayland-protocols 0.32.5", + "wayland-scanner", +] + +[[package]] +name = "wayland-protocols-wlr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad1f61b76b6c2d8742e10f9ba5c3737f6530b4c243132c2a2ccc8aa96fe25cd6" +dependencies = [ + "bitflags 2.8.0", + "wayland-backend", + "wayland-client", + "wayland-protocols 0.31.2", "wayland-scanner", ] @@ -3000,7 +3236,7 @@ dependencies = [ "bitflags 2.8.0", "wayland-backend", "wayland-client", - "wayland-protocols", + "wayland-protocols 0.32.5", "wayland-scanner", ] @@ -3037,6 +3273,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "web-time" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa30049b1c872b72c89866d458eae9f20380ab280ffd1b1e18df2d3e2d98cfe0" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "web-time" version = "1.1.0" @@ -3055,7 +3301,7 @@ checksum = "47f55718f85c2fa756edffa0e7f0e0a60aba463d1362b57e23123c58f035e4b6" dependencies = [ "arrayvec", "bitflags 2.8.0", - "cfg_aliases", + "cfg_aliases 0.2.1", "document-features", "js-sys", "log", @@ -3082,7 +3328,7 @@ dependencies = [ "arrayvec", "bit-vec", "bitflags 2.8.0", - "cfg_aliases", + "cfg_aliases 0.2.1", "document-features", "indexmap 2.7.0", "log", @@ -3111,7 +3357,7 @@ dependencies = [ "bitflags 2.8.0", "block", "bytemuck", - "cfg_aliases", + "cfg_aliases 0.2.1", "core-graphics-types", "glow", "glutin_wgl_sys", @@ -3249,6 +3495,15 @@ dependencies = [ "windows-targets 0.42.2", ] +[[package]] +name = "windows-sys" +version = "0.48.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" +dependencies = [ + "windows-targets 0.48.5", +] + [[package]] name = "windows-sys" version = "0.52.0" @@ -3445,6 +3700,54 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "winit" +version = "0.29.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d59ad965a635657faf09c8f062badd885748428933dad8e8bdd64064d92e5ca" +dependencies = [ + "ahash", + "android-activity 0.5.2", + "atomic-waker", + "bitflags 2.8.0", + "bytemuck", + "calloop 0.12.4", + "cfg_aliases 0.1.1", + "core-foundation", + "core-graphics", + "cursor-icon", + "icrate", + "js-sys", + "libc", + "log", + "memmap2", + "ndk 0.8.0", + "ndk-sys 0.5.0+25.2.9519653", + "objc2 0.4.1", + "once_cell", + "orbclient", + "percent-encoding", + "raw-window-handle", + "redox_syscall 0.3.5", + "rustix", + "sctk-adwaita 0.8.3", + "smithay-client-toolkit 0.18.1", + "smol_str", + "unicode-segmentation", + "wasm-bindgen", + "wasm-bindgen-futures", + "wayland-backend", + "wayland-client", + "wayland-protocols 0.31.2", + "wayland-protocols-plasma 0.2.0", + "web-sys", + "web-time 0.2.4", + "windows-sys 0.48.0", + "x11-dl", + "x11rb", + "xkbcommon-dl", +] + [[package]] name = "winit" version = "0.30.8" @@ -3452,13 +3755,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f5d74280aabb958072864bff6cfbcf9025cf8bfacdde5e32b5e12920ef703b0f" dependencies = [ "ahash", - "android-activity", + "android-activity 0.6.0", "atomic-waker", "bitflags 2.8.0", - "block2", + "block2 0.5.1", "bytemuck", - "calloop", - "cfg_aliases", + "calloop 0.13.0", + "cfg_aliases 0.2.1", "concurrent-queue", "core-foundation", "core-graphics", @@ -3467,8 +3770,8 @@ dependencies = [ "js-sys", "libc", "memmap2", - "ndk", - "objc2", + "ndk 0.9.0", + "objc2 0.5.2", "objc2-app-kit", "objc2-foundation", "objc2-ui-kit", @@ -3478,8 +3781,8 @@ dependencies = [ "raw-window-handle", "redox_syscall 0.4.1", "rustix", - "sctk-adwaita", - "smithay-client-toolkit", + "sctk-adwaita 0.10.1", + "smithay-client-toolkit 0.19.2", "smol_str", "tracing", "unicode-segmentation", @@ -3487,10 +3790,10 @@ dependencies = [ "wasm-bindgen-futures", "wayland-backend", "wayland-client", - "wayland-protocols", - "wayland-protocols-plasma", + "wayland-protocols 0.32.5", + "wayland-protocols-plasma 0.3.5", "web-sys", - "web-time", + "web-time 1.1.0", "windows-sys 0.52.0", "x11-dl", "x11rb", @@ -3529,7 +3832,7 @@ dependencies = [ "kurbo", "log", "notify-debouncer-full", - "pollster", + "pollster 0.4.0", "profiling", "scenes", "tracing", @@ -3540,9 +3843,9 @@ dependencies = [ "wasm-bindgen", "wasm-bindgen-futures", "web-sys", - "web-time", + "web-time 1.1.0", "wgpu-profiler", - "winit", + "winit 0.30.8", ] [[package]] diff --git a/vello_api/Cargo.toml b/vello_api/Cargo.toml index ca133d96..e79892ba 100644 --- a/vello_api/Cargo.toml +++ b/vello_api/Cargo.toml @@ -20,4 +20,4 @@ targets = [] workspace = true [dependencies] -peniko = { git = "https://github.com/linebender/peniko", rev = "aeded39" } +peniko = "0.3.1" diff --git a/vello_hybrid/Cargo.toml b/vello_hybrid/Cargo.toml index 2656de48..b41b1f9d 100644 --- a/vello_hybrid/Cargo.toml +++ b/vello_hybrid/Cargo.toml @@ -23,6 +23,15 @@ workspace = true vello_api = { path = "../vello_api" } flatten = { git = "https://github.com/linebender/gpu-stroke-expansion-paper", rev = "827ccf6" } +# The following dependencies are for GPU. Maybe they should be optional, or the crate +# structure refactored so there's a CPU-only renderer that doesn't depend on these. + +wgpu = { version = "24.0.1" } +winit = "0.29" +pollster = "0.3" +async-executor = "1.8" +bytemuck = { version = "1.14", features = ["derive"] } + [dev-dependencies] png = "0.17.14" roxmltree = "0.20.0" diff --git a/vello_hybrid/examples/gpu.rs b/vello_hybrid/examples/gpu.rs new file mode 100644 index 00000000..d5d9fe51 --- /dev/null +++ b/vello_hybrid/examples/gpu.rs @@ -0,0 +1,176 @@ +// Copyright 2025 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! Example program for CPU/GPU hybrid rendering. + +use std::sync::Arc; + +use bytemuck::{Pod, Zeroable}; +use vello_api::peniko::{color::palette, kurbo::{BezPath, Stroke}}; +use vello_hybrid::{GpuRenderCtx, GpuSession}; +use wgpu::util::DeviceExt; + +use winit::{ + event::{Event, WindowEvent}, + event_loop::EventLoop, + window::Window, +}; + +#[repr(C)] +#[derive(Copy, Clone, Pod, Zeroable)] +struct Config { + width: u32, + height: u32, + strip_height: u32, +} + +async fn run(event_loop: EventLoop<()>, window: Window) { + let window = Arc::new(window); + let window_clone = window.clone(); + let instance = wgpu::Instance::new(&Default::default()); + let surface = instance.create_surface(&window).unwrap(); + let adapter = instance + .request_adapter(&wgpu::RequestAdapterOptions { + power_preference: Default::default(), + force_fallback_adapter: false, + compatible_surface: Some(&surface), + }) + .await + .expect("error finding adapter"); + + let (device, queue) = adapter + .request_device(&Default::default(), None) + .await + .expect("error creating device"); + let size = window.inner_size(); + let swapchain_capabilities = surface.get_capabilities(&adapter); + let format = swapchain_capabilities.formats[0]; + let sc = wgpu::SurfaceConfiguration { + usage: wgpu::TextureUsages::RENDER_ATTACHMENT, + format, + width: size.width, + height: size.height, + present_mode: wgpu::PresentMode::Fifo, + alpha_mode: swapchain_capabilities.alpha_modes[0], + view_formats: vec![], + desired_maximum_frame_latency: 2, + }; + surface.configure(&device, &sc); + + let session = GpuSession::new(&device, format); + // TODO: actually render something + let mut render_ctx = GpuRenderCtx::new(size.width as usize, size.height as usize); + draw_simple_scene(&mut render_ctx); + let bufs = render_ctx.harvest(); + + let config = Config { + width: size.width, + height: size.height, + strip_height: 4, + }; + + let config_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("config"), + contents: bytemuck::bytes_of(&config), + usage: wgpu::BufferUsages::UNIFORM, + }); + + let strip_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("strip"), + contents: bytemuck::cast_slice(&bufs.strips), + usage: wgpu::BufferUsages::STORAGE, + }); + let alpha_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor { + label: Some("alpha"), + contents: bytemuck::cast_slice(&bufs.alphas), + usage: wgpu::BufferUsages::STORAGE, + }); + + let render_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor { + label: None, + layout: &session.render_bind_group_layout, + entries: &[ + wgpu::BindGroupEntry { + binding: 0, + resource: alpha_buf.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 1, + resource: config_buf.as_entire_binding(), + }, + wgpu::BindGroupEntry { + binding: 2, + resource: strip_buf.as_entire_binding(), + }, + ], + }); + + event_loop + .run(move |event, target| { + if let Event::WindowEvent { + window_id: _, + event, + } = event + { + match event { + WindowEvent::RedrawRequested => { + let frame = surface + .get_current_texture() + .expect("error getting texture from swap chain"); + + let mut encoder = device.create_command_encoder(&Default::default()); + { + let view = frame + .texture + .create_view(&wgpu::TextureViewDescriptor::default()); + let mut rpass = + encoder.begin_render_pass(&wgpu::RenderPassDescriptor { + label: None, + color_attachments: &[Some(wgpu::RenderPassColorAttachment { + view: &view, + resolve_target: None, + ops: wgpu::Operations { + load: wgpu::LoadOp::Clear(wgpu::Color::BLACK), + store: wgpu::StoreOp::Store, + }, + })], + depth_stencil_attachment: None, + timestamp_writes: None, + occlusion_query_set: None, + }); + rpass.set_pipeline(&session.render_pipeline); + rpass.set_bind_group(0, &render_bind_group, &[]); + rpass.draw(0..4, 0..bufs.strips.len() as u32); + } + queue.submit(Some(encoder.finish())); + frame.present(); + window_clone.request_redraw(); + } + WindowEvent::CloseRequested => { + target.exit(); + } + _ => (), + } + } + }) + .unwrap(); +} + +fn draw_simple_scene(ctx: &mut GpuRenderCtx) { + let mut path = BezPath::new(); + path.move_to((10.0, 10.0)); + path.line_to((180.0, 20.0)); + path.line_to((30.0, 40.0)); + path.close_path(); + let piet_path = path.into(); + ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into()); + let stroke = Stroke::new(5.0); + ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into()); +} + +fn main() { + let event_loop = EventLoop::new().unwrap(); + let window = Window::new(&event_loop).unwrap(); + window.set_resizable(false); + pollster::block_on(run(event_loop, window)); +} diff --git a/vello_hybrid/shader/render.wgsl b/vello_hybrid/shader/render.wgsl new file mode 100644 index 00000000..4b1ee1f6 --- /dev/null +++ b/vello_hybrid/shader/render.wgsl @@ -0,0 +1,74 @@ +// Copyright 2024 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +// A simple render pipeline for solid color sparse strip rendering. + +// Each instance draws one strip consisting of alpha values (dense_width) +// then a solid region. + +struct VertexOutput { + @location(0) tex_coord: vec2, + @location(1) @interpolate(flat) dense_end: u32, + @location(2) @interpolate(flat) color: u32, + @builtin(position) position: vec4, +}; + +struct Config { + width: u32, + height: u32, + strip_height: u32, +} + +struct Strip { + xy: u32, // this could be u16's on the Rust side + // [width, dense_width] packed as u16's + widths: u32, + col: u32, + rgba: u32, +} + +@group(0) @binding(1) +var config: Config; + +@group(0) @binding(2) +var strips: array; + +@vertex +fn vs_main( + @builtin(vertex_index) in_vertex_index: u32, + @builtin(instance_index) in_instance_index: u32 +) -> VertexOutput { + var out: VertexOutput; + let x = f32(in_vertex_index & 1u); + let y = f32(in_vertex_index >> 1u); + let strip = strips[in_instance_index]; + let next_strip = strips[in_instance_index + 1u]; + let x0 = strip.xy & 0xffffu; + let y0 = strip.xy >> 16u; + let width = strip.widths & 0xffffu; + let dense_width = strip.widths >> 16u; + out.dense_end = strip.col + dense_width; + let pix_x = f32(x0) + f32(width) * x; + let pix_y = f32(y0) + y * f32(config.strip_height); + let gl_x = (pix_x + 0.5) * 2.0 / f32(config.width) - 1.0; + let gl_y = 1.0 - (pix_y + 0.5) * 2.0 / f32(config.height); + out.position = vec4(gl_x, gl_y, 0.0, 1.0); + out.tex_coord = vec2(f32(strip.col) + x * f32(width), y * f32(config.strip_height)); + out.color = strip.rgba; + return out; +} + +@group(0) @binding(0) +var alphas: array; + +@fragment +fn fs_main(in: VertexOutput) -> @location(0) vec4 { + let x = u32(floor(in.tex_coord.x)); + var alpha = 1.0; + if x < in.dense_end { + let y = u32(floor(in.tex_coord.y)); + let a = alphas[x]; + alpha = f32((a >> (y * 8u)) & 0xffu) * (1.0 / 255.0); + } + return alpha * unpack4x8unorm(in.color); +} diff --git a/vello_hybrid/src/lib.rs b/vello_hybrid/src/lib.rs index 341c6e90..fdfedf89 100644 --- a/vello_hybrid/src/lib.rs +++ b/vello_hybrid/src/lib.rs @@ -8,6 +8,7 @@ mod fine; mod flatten; +mod gpu; mod pixmap; mod render; mod simd; @@ -15,6 +16,10 @@ mod strip; mod tiling; mod wide_tile; +pub use gpu::{GpuRenderBufs, GpuRenderCtx, GpuSession}; pub use pixmap::Pixmap; pub use render::{CsRenderCtx, CsResourceCtx}; pub use tiling::FlatLine; + +// TODO: this export should be removed, buffer upload will be internal +pub use gpu::Strip; diff --git a/vello_hybrid/src/render.rs b/vello_hybrid/src/render.rs index 8216ef5a..1c7bfa67 100644 --- a/vello_hybrid/src/render.rs +++ b/vello_hybrid/src/render.rs @@ -24,10 +24,10 @@ use crate::{ }; pub struct CsRenderCtx { - width: usize, - height: usize, - tiles: Vec, - alphas: Vec, + pub(crate) width: usize, + pub(crate) height: usize, + pub(crate) tiles: Vec, + pub(crate) alphas: Vec, /// These are all scratch buffers, to be used for path rendering. They're here solely /// so the allocations can be reused. diff --git a/vello_hybrid/src/wide_tile.rs b/vello_hybrid/src/wide_tile.rs index fe5296a9..5e71153e 100644 --- a/vello_hybrid/src/wide_tile.rs +++ b/vello_hybrid/src/wide_tile.rs @@ -21,6 +21,7 @@ pub(crate) enum Cmd { pub(crate) struct CmdFill { pub(crate) x: u32, pub(crate) width: u32, + // TODO: Probably want this pre-packed to u32 to avoid packing cost pub(crate) color: AlphaColor, } From cdeecc758abeca4f949ab97961915079f95b691f Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Wed, 19 Feb 2025 12:24:04 -0800 Subject: [PATCH 4/5] Add missing file, fix lints --- vello_hybrid/examples/gpu.rs | 12 +- vello_hybrid/src/gpu.rs | 212 +++++++++++++++++++++++++++++++++++ 2 files changed, 220 insertions(+), 4 deletions(-) create mode 100644 vello_hybrid/src/gpu.rs diff --git a/vello_hybrid/examples/gpu.rs b/vello_hybrid/examples/gpu.rs index d5d9fe51..eeb6a1ba 100644 --- a/vello_hybrid/examples/gpu.rs +++ b/vello_hybrid/examples/gpu.rs @@ -6,7 +6,10 @@ use std::sync::Arc; use bytemuck::{Pod, Zeroable}; -use vello_api::peniko::{color::palette, kurbo::{BezPath, Stroke}}; +use vello_api::peniko::{ + color::palette, + kurbo::{BezPath, Stroke}, +}; use vello_hybrid::{GpuRenderCtx, GpuSession}; use wgpu::util::DeviceExt; @@ -109,10 +112,10 @@ async fn run(event_loop: EventLoop<()>, window: Window) { .run(move |event, target| { if let Event::WindowEvent { window_id: _, - event, + event: window_event, } = event { - match event { + match window_event { WindowEvent::RedrawRequested => { let frame = surface .get_current_texture() @@ -140,7 +143,8 @@ async fn run(event_loop: EventLoop<()>, window: Window) { }); rpass.set_pipeline(&session.render_pipeline); rpass.set_bind_group(0, &render_bind_group, &[]); - rpass.draw(0..4, 0..bufs.strips.len() as u32); + let n_strips = bufs.strips.len().try_into().expect("too many strips"); + rpass.draw(0..4, 0..n_strips); } queue.submit(Some(encoder.finish())); frame.present(); diff --git a/vello_hybrid/src/gpu.rs b/vello_hybrid/src/gpu.rs new file mode 100644 index 00000000..b4ed688a --- /dev/null +++ b/vello_hybrid/src/gpu.rs @@ -0,0 +1,212 @@ +// Copyright 2025 the Vello Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +//! The GPU parts of a hybrid CPU/GPU rendering engine. + +use bytemuck::{Pod, Zeroable}; +use vello_api::{peniko::BrushRef, RenderCtx}; +use wgpu::{ + BindGroupLayout, BlendState, ColorTargetState, ColorWrites, Device, PipelineCompilationOptions, + RenderPipeline, TextureFormat, +}; + +use crate::{ + wide_tile::{Cmd, STRIP_HEIGHT, WIDE_TILE_WIDTH}, + CsRenderCtx, +}; + +/// Resources common to GPU renders. +pub struct GpuSession { + pub render_bind_group_layout: BindGroupLayout, + pub render_pipeline: RenderPipeline, +} + +#[repr(C)] +#[derive(Debug, Clone, Copy, Zeroable, Pod)] +pub struct Strip { + x: u16, + y: u16, + width: u16, + dense_width: u16, + col: u32, + rgba: u32, +} + +/// A render context for a single frame. +/// +/// This will eventually get a `RenderCtx` trait impl. +pub struct GpuRenderCtx { + // At the moment, we take the entire cpu-sparse render context, + // but we might split that up. + inner: CsRenderCtx, +} + +/// The buffers from a render. +/// +/// This being a struct is based on a model where all the buffers are uploaded +/// up front. That will be replaced by the "submit early and often" model. +pub struct GpuRenderBufs { + pub strips: Vec, + pub alphas: Vec, +} + +impl GpuSession { + pub fn new(device: &Device, format: TextureFormat) -> Self { + let render_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor { + label: None, + source: wgpu::ShaderSource::Wgsl(include_str!("../shader/render.wgsl").into()), + }); + let render_bind_group_layout = + device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor { + label: None, + entries: &[ + wgpu::BindGroupLayoutEntry { + binding: 0, + visibility: wgpu::ShaderStages::FRAGMENT, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 1, + visibility: wgpu::ShaderStages::VERTEX, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Uniform, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + wgpu::BindGroupLayoutEntry { + binding: 2, + visibility: wgpu::ShaderStages::VERTEX, + ty: wgpu::BindingType::Buffer { + ty: wgpu::BufferBindingType::Storage { read_only: true }, + has_dynamic_offset: false, + min_binding_size: None, + }, + count: None, + }, + ], + }); + let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor { + label: None, + bind_group_layouts: &[&render_bind_group_layout], + push_constant_ranges: &[], + }); + let render_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor { + label: None, + layout: Some(&pipeline_layout), + vertex: wgpu::VertexState { + module: &render_shader, + entry_point: Some("vs_main"), + buffers: &[], + compilation_options: PipelineCompilationOptions::default(), + }, + fragment: Some(wgpu::FragmentState { + module: &render_shader, + entry_point: Some("fs_main"), + targets: &[Some(ColorTargetState { + format, + blend: Some(BlendState::PREMULTIPLIED_ALPHA_BLENDING), + write_mask: ColorWrites::ALL, + })], + compilation_options: PipelineCompilationOptions::default(), + }), + primitive: wgpu::PrimitiveState { + topology: wgpu::PrimitiveTopology::TriangleStrip, + ..Default::default() + }, + depth_stencil: None, + multisample: wgpu::MultisampleState::default(), + multiview: None, + cache: None, + }); + Self { + render_bind_group_layout, + render_pipeline, + } + } +} + +impl GpuRenderCtx { + pub fn new(width: usize, height: usize) -> Self { + Self { + inner: CsRenderCtx::new(width, height), + } + } + + pub fn harvest(&self) -> GpuRenderBufs { + let mut strips = Vec::new(); + let width_tiles = (self.inner.width).div_ceil(WIDE_TILE_WIDTH); + let height_tiles = (self.inner.height).div_ceil(STRIP_HEIGHT); + for y in 0..height_tiles { + for x in 0..width_tiles { + let tile = &self.inner.tiles[y * width_tiles + x]; + let tile_x = x * WIDE_TILE_WIDTH; + let tile_y = y * STRIP_HEIGHT; + let bg = tile.bg.to_rgba8().to_u32(); + if bg != 0 { + let strip = Strip { + x: tile_x as u16, + y: tile_y as u16, + width: WIDE_TILE_WIDTH as u16, + dense_width: 0, + col: 0, + rgba: bg, + }; + strips.push(strip); + } + for cmd in &tile.cmds { + match cmd { + Cmd::Fill(fill) => { + let strip = Strip { + x: (tile_x as u32 + fill.x) as u16, + y: tile_y as u16, + width: fill.width as u16, + dense_width: 0, + col: 0, + rgba: fill.color.to_rgba8().to_u32(), + }; + strips.push(strip); + } + Cmd::Strip(cmd_strip) => { + let strip = Strip { + x: (tile_x as u32 + cmd_strip.x) as u16, + y: tile_y as u16, + width: cmd_strip.width as u16, + dense_width: cmd_strip.width as u16, + col: cmd_strip.alpha_ix as u32, + rgba: cmd_strip.color.to_rgba8().to_u32(), + }; + strips.push(strip); + } + } + } + } + } + GpuRenderBufs { + strips, + alphas: self.inner.alphas.clone(), + } + } +} + +// This block will eventually turn into an impl of RenderCtx. +impl GpuRenderCtx { + pub fn fill(&mut self, path: &vello_api::Path, brush: BrushRef<'_>) { + self.inner.fill(path, brush); + } + + pub fn stroke( + &mut self, + path: &vello_api::Path, + stroke: &vello_api::peniko::kurbo::Stroke, + brush: BrushRef<'_>, + ) { + self.inner.stroke(path, stroke, brush); + } +} From 184dd5ba2bc5d535c78e131c536f0faa0157b626 Mon Sep 17 00:00:00 2001 From: Raph Levien Date: Thu, 20 Feb 2025 17:01:09 -0800 Subject: [PATCH 5/5] Implement basic clip logic Adds a clip method to the (CPU) render context, plus a considerable amount of mechanism in coarse and fine rasterization to support clipping. The coarse rasterization logic contains a similar set of optimizations as Vello. In particular, all-zero tiles have drawing suppressed, and all-one tiles pass drawing commands through with no additional work to clip. Not extensively validated, but it does render a simple scene with clipping correctly. --- vello_hybrid/examples/gpu.rs | 2 +- vello_hybrid/examples/simple.rs | 41 +++-- vello_hybrid/src/fine.rs | 58 +++++- vello_hybrid/src/gpu.rs | 4 +- vello_hybrid/src/render.rs | 304 +++++++++++++++++++++++++++++--- vello_hybrid/src/simd/neon.rs | 24 +-- vello_hybrid/src/wide_tile.rs | 104 ++++++++++- 7 files changed, 476 insertions(+), 61 deletions(-) diff --git a/vello_hybrid/examples/gpu.rs b/vello_hybrid/examples/gpu.rs index eeb6a1ba..433823f2 100644 --- a/vello_hybrid/examples/gpu.rs +++ b/vello_hybrid/examples/gpu.rs @@ -61,7 +61,7 @@ async fn run(event_loop: EventLoop<()>, window: Window) { surface.configure(&device, &sc); let session = GpuSession::new(&device, format); - // TODO: actually render something + let mut render_ctx = GpuRenderCtx::new(size.width as usize, size.height as usize); draw_simple_scene(&mut render_ctx); let bufs = render_ctx.harvest(); diff --git a/vello_hybrid/examples/simple.rs b/vello_hybrid/examples/simple.rs index e91f0175..bae2c10e 100644 --- a/vello_hybrid/examples/simple.rs +++ b/vello_hybrid/examples/simple.rs @@ -7,24 +7,16 @@ use std::io::BufWriter; use vello_api::peniko::color::palette; -use vello_api::peniko::kurbo::{BezPath, Stroke}; +use vello_api::peniko::kurbo::{BezPath, Point, Stroke, Vec2}; use vello_api::RenderCtx; use vello_hybrid::{CsRenderCtx, Pixmap}; const WIDTH: usize = 1024; -const HEIGHT: usize = 256; +const HEIGHT: usize = 1024; pub fn main() { let mut ctx = CsRenderCtx::new(WIDTH, HEIGHT); - let mut path = BezPath::new(); - path.move_to((10.0, 10.0)); - path.line_to((180.0, 20.0)); - path.line_to((30.0, 40.0)); - path.close_path(); - let piet_path = path.into(); - ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into()); - let stroke = Stroke::new(5.0); - ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into()); + draw_simple_scene(&mut ctx); if let Some(filename) = std::env::args().nth(1) { let mut pixmap = Pixmap::new(WIDTH, HEIGHT); ctx.render_to_pixmap(&mut pixmap); @@ -39,3 +31,30 @@ pub fn main() { ctx.debug_dump(); } } + +fn star(center: Point, n: usize, inner: f64, outer: f64) -> BezPath { + let mut path = BezPath::new(); + path.move_to(center + Vec2::new(outer, 0.)); + for i in 1..n * 2 { + let th = i as f64 * std::f64::consts::PI / n as f64; + let r = if i % 2 == 0 { outer } else { inner }; + path.line_to(center + r * Vec2::from_angle(th)); + } + path.close_path(); + path +} + +fn draw_simple_scene(ctx: &mut CsRenderCtx) { + let mut path = BezPath::new(); + path.move_to((10.0, 10.0)); + path.line_to((180.0, 20.0)); + path.line_to((30.0, 180.0)); + path.close_path(); + // Note: we plan to change the API to have `into`. + let piet_path = path.into(); + let stroke = Stroke::new(5.0); + ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into()); + let star_path = star(Point::new(100., 100.), 13, 50., 95.); + ctx.clip(&star_path.into()); + ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into()); +} diff --git a/vello_hybrid/src/fine.rs b/vello_hybrid/src/fine.rs index 4b935528..52b94b05 100644 --- a/vello_hybrid/src/fine.rs +++ b/vello_hybrid/src/fine.rs @@ -15,7 +15,7 @@ pub(crate) struct Fine<'a> { // f32 RGBA pixels // That said, if we use u8, then this is basically a block of // untyped memory. - pub(crate) scratch: [f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4], + pub(crate) scratch: Vec<[f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]>, #[allow(clippy::doc_markdown, reason = "false positive for x86_64")] /// Whether to use SIMD /// @@ -31,7 +31,7 @@ pub(crate) struct Fine<'a> { impl<'a> Fine<'a> { pub(crate) fn new(width: usize, height: usize, out_buf: &'a mut [u8]) -> Self { - let scratch = [0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]; + let scratch = vec![[0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]]; Self { width, height, @@ -42,7 +42,8 @@ impl<'a> Fine<'a> { } pub(crate) fn clear_scalar(&mut self, color: [f32; 4]) { - for z in self.scratch.chunks_exact_mut(4) { + let scratch = self.scratch.last_mut().unwrap(); + for z in scratch.chunks_exact_mut(4) { z.copy_from_slice(&color); } } @@ -58,12 +59,13 @@ impl<'a> Fine<'a> { (y + 1) * STRIP_HEIGHT <= self.height, "overflow of pixmap height" ); + let scratch = self.scratch.last_mut().unwrap(); let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4; for j in 0..STRIP_HEIGHT { let line_ix = base_ix + j * self.width * 4; for i in 0..WIDE_TILE_WIDTH { let mut rgba_f32 = [0.0; 4]; - rgba_f32.copy_from_slice(&self.scratch[(i * STRIP_HEIGHT + j) * 4..][..4]); + rgba_f32.copy_from_slice(&scratch[(i * STRIP_HEIGHT + j) * 4..][..4]); let rgba_u8 = rgba_f32.map(|z| (z * 255.0).round() as u8); self.out_buf[line_ix + i * 4..][..4].copy_from_slice(&rgba_u8); } @@ -79,20 +81,28 @@ impl<'a> Fine<'a> { let aslice = &alphas[s.alpha_ix..]; self.strip(s.x as usize, s.width as usize, aslice, s.color.components); } + Cmd::PushClip => self.scratch.push([0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]), + Cmd::PopClip => _ = self.scratch.pop(), + Cmd::ClipFill(f) => { + self.clip_fill_scalar(f.x as usize, f.width as usize); + } + Cmd::ClipStrip(s) => { + let aslice = &alphas[s.alpha_ix..]; + self.clip_strip_scalar(s.x as usize, s.width as usize, aslice); + } } } pub(crate) fn fill_scalar(&mut self, x: usize, width: usize, color: [f32; 4]) { + let scratch = self.scratch.last_mut().unwrap(); if color[3] == 1.0 { - for z in - self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) + for z in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) { z.copy_from_slice(&color); } } else { let one_minus_alpha = 1.0 - color[3]; - for z in - self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) + for z in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4) { for i in 0..4 { //z[i] = color[i] + one_minus_alpha * z[i]; @@ -106,9 +116,10 @@ impl<'a> Fine<'a> { } pub(crate) fn strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) { + let scratch = self.scratch.last_mut().unwrap(); debug_assert!(alphas.len() >= width, "overflow of alphas buffer"); let cs = color.map(|z| z * (1.0 / 255.0)); - for (z, a) in self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width] + for (z, a) in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width] .chunks_exact_mut(16) .zip(alphas) { @@ -121,4 +132,33 @@ impl<'a> Fine<'a> { } } } + + fn clip_fill_scalar(&mut self, x: usize, width: usize) { + let (tos, rest) = self.scratch.split_last_mut().unwrap(); + let nos = rest.last_mut().unwrap(); + for i in 0..width { + for j in 0..4 { + let ix = (x + i) * STRIP_HEIGHT_F32 + j * 4; + let one_minus_alpha = 1.0 - tos[ix + 3]; + for k in 0..4 { + nos[ix + k] = nos[ix + k].mul_add(one_minus_alpha, tos[ix + k]); + } + } + } + } + + fn clip_strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32]) { + let (tos, rest) = self.scratch.split_last_mut().unwrap(); + let nos = rest.last_mut().unwrap(); + for (i, a) in alphas.iter().take(width).enumerate() { + for j in 0..4 { + let ix = (x + i) * STRIP_HEIGHT_F32 + j * 4; + let mask_alpha = ((a >> (j * 8)) & 0xff) as f32 * (1. / 255.); + let one_minus_alpha = 1.0 - mask_alpha * tos[ix + 3]; + for k in 0..4 { + nos[ix + k] = nos[ix + k].mul_add(one_minus_alpha, mask_alpha * tos[ix + k]); + } + } + } + } } diff --git a/vello_hybrid/src/gpu.rs b/vello_hybrid/src/gpu.rs index b4ed688a..c5a1b652 100644 --- a/vello_hybrid/src/gpu.rs +++ b/vello_hybrid/src/gpu.rs @@ -139,7 +139,8 @@ impl GpuRenderCtx { } } - pub fn harvest(&self) -> GpuRenderBufs { + pub fn harvest(&mut self) -> GpuRenderBufs { + self.inner.finish(); let mut strips = Vec::new(); let width_tiles = (self.inner.width).div_ceil(WIDE_TILE_WIDTH); let height_tiles = (self.inner.height).div_ceil(STRIP_HEIGHT); @@ -184,6 +185,7 @@ impl GpuRenderCtx { }; strips.push(strip); } + _ => todo!(), } } } diff --git a/vello_hybrid/src/render.rs b/vello_hybrid/src/render.rs index 1c7bfa67..7ecb8b27 100644 --- a/vello_hybrid/src/render.rs +++ b/vello_hybrid/src/render.rs @@ -19,7 +19,7 @@ use crate::{ fine::Fine, strip::{self, Strip, Tile}, tiling::{self, FlatLine}, - wide_tile::{Cmd, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH}, + wide_tile::{Cmd, CmdClipStrip, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH}, Pixmap, }; @@ -34,6 +34,22 @@ pub struct CsRenderCtx { line_buf: Vec, tile_buf: Vec, strip_buf: Vec, + + state_stack: Vec, + clip_stack: Vec, +} + +struct GfxState { + // TODO: transform goes here (there's logic in piet-ts to copy) + n_clip: usize, +} + +struct Clip { + // should probably be a bounding box type + /// The intersected bounding box after clip + clip_bbox: [usize; 4], + /// The rendered path in sparse strip representation + strips: Vec, } pub struct CsResourceCtx; @@ -45,18 +61,17 @@ impl CsRenderCtx { let tiles = (0..width_tiles * height_tiles) .map(|_| WideTile::default()) .collect(); - let alphas = vec![]; - let line_buf = vec![]; - let tile_buf = vec![]; - let strip_buf = vec![]; + let state = GfxState { n_clip: 0 }; Self { width, height, tiles, - alphas, - line_buf, - tile_buf, - strip_buf, + alphas: vec![], + line_buf: vec![], + tile_buf: vec![], + strip_buf: vec![], + state_stack: vec![state], + clip_stack: vec![], } } @@ -67,7 +82,16 @@ impl CsRenderCtx { } } - pub fn render_to_pixmap(&self, pixmap: &mut Pixmap) { + /// Finish the coarse rasterization prior to fine rendering. + /// + /// At the moment, this mostly involves resolving any open clips, but + /// might extend to other things. + pub(crate) fn finish(&mut self) { + self.pop_clips(); + } + + pub fn render_to_pixmap(&mut self, pixmap: &mut Pixmap) { + self.finish(); let mut fine = Fine::new(pixmap.width, pixmap.height, &mut pixmap.buf); let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH); let height_tiles = (self.height).div_ceil(STRIP_HEIGHT); @@ -94,27 +118,43 @@ impl CsRenderCtx { println!("total = {total}, {histo:?}"); } + /// Render a path to the strip buffer. + fn render_path_common(&mut self) { + tiling::make_tiles(&self.line_buf, &mut self.tile_buf); + self.tile_buf.sort_unstable_by(Tile::cmp); + crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas); + } + /// Render a path, which has already been flattened into `line_buf`. fn render_path(&mut self, brush: BrushRef<'_>) { // TODO: need to make sure tiles contained in viewport - we'll likely // panic otherwise. - tiling::make_tiles(&self.line_buf, &mut self.tile_buf); - self.tile_buf.sort_unstable_by(Tile::cmp); - crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas); + self.render_path_common(); let color = brush_to_color(brush); let width_tiles = self.width.div_ceil(WIDE_TILE_WIDTH); + let bbox = self.get_bbox(); for i in 0..self.strip_buf.len() - 1 { let strip = &self.strip_buf[i]; let next_strip = &self.strip_buf[i + 1]; let x0 = strip.x(); - let y = strip.strip_y(); - let row_start = y as usize * width_tiles; + let y = strip.strip_y() as usize; + if y < bbox[1] { + continue; + } + if y >= bbox[3] { + break; + } + let row_start = y * width_tiles; let strip_width = next_strip.col - strip.col; let x1 = x0 + strip_width; - let xtile0 = x0 as usize / WIDE_TILE_WIDTH; - let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH); + let xtile0 = (x0 as usize / WIDE_TILE_WIDTH).max(bbox[0]); + let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH).min(bbox[2]); let mut x = x0; let mut col = strip.col; + if (bbox[0] * WIDE_TILE_WIDTH) as u32 > x { + col += (bbox[0] * WIDE_TILE_WIDTH) as u32 - x; + x = (bbox[0] * WIDE_TILE_WIDTH) as u32; + } for xtile in xtile0..xtile1 { let x_tile_rel = x % WIDE_TILE_WIDTH as u32; let width = x1.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x; @@ -126,13 +166,13 @@ impl CsRenderCtx { }; x += width; col += width; - self.tiles[row_start + xtile].push(Cmd::Strip(cmd)); + self.tiles[row_start + xtile].strip(cmd); } - if next_strip.winding != 0 && y == next_strip.strip_y() { + if next_strip.winding != 0 && y == next_strip.strip_y() as usize { x = x1; let x2 = next_strip.x(); - let fxt0 = x1 as usize / WIDE_TILE_WIDTH; - let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH); + let fxt0 = (x1 as usize / WIDE_TILE_WIDTH).max(bbox[0]); + let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH).min(bbox[2]); for xtile in fxt0..fxt1 { let x_tile_rel = x % WIDE_TILE_WIDTH as u32; let width = x2.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x; @@ -161,6 +201,133 @@ impl CsRenderCtx { // TODO: get from graphics state Affine::scale(5.0) } + + fn get_bbox(&self) -> [usize; 4] { + if let Some(tos) = self.clip_stack.last() { + tos.clip_bbox + } else { + let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH); + let height_tiles = (self.height).div_ceil(STRIP_HEIGHT); + [0, 0, width_tiles, height_tiles] + } + } + + fn pop_clip(&mut self) { + self.state_stack.last_mut().unwrap().n_clip -= 1; + let Clip { clip_bbox, strips } = self.clip_stack.pop().unwrap(); + let n_strips = strips.len(); + // The next bit of code accomplishes the following. For each tile in + // the intersected bounding box, it does one of three things depending + // on the contents of the clip path in that tile. + // If all-zero: pop a zero_clip. + // If all-one: do nothing. + // If contains one or more strips: render strips and fills, then pop a clip. + // This logic is the inverse of the push logic in `clip()`, and the stack + // should be balanced after running both. + let mut tile_x = clip_bbox[0]; + let mut tile_y = clip_bbox[1]; + let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH); + let mut pop_pending = false; + for i in 0..n_strips - 1 { + let strip = &strips[i]; + let y = strip.strip_y() as usize; + if y < tile_y { + continue; + } + while tile_y < y.min(clip_bbox[3]) { + if core::mem::take(&mut pop_pending) { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + tile_x += 1; + } + for x in tile_x..clip_bbox[2] { + self.tiles[tile_y * width_tiles + x].pop_zero_clip(); + } + tile_x = clip_bbox[0]; + tile_y += 1; + } + if tile_y == clip_bbox[3] { + break; + } + let x0 = strip.x() as usize; + let x_clamped = (x0 / WIDE_TILE_WIDTH).min(clip_bbox[2]); + if tile_x < x_clamped { + if core::mem::take(&mut pop_pending) { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + tile_x += 1; + } + // The winding check is probably not needed; if there was a fill, + // the logic below should have advanced tile_x. + if strip.winding == 0 { + for x in tile_x..x_clamped { + self.tiles[tile_y * width_tiles + x].pop_zero_clip(); + } + } + tile_x = x_clamped; + } + let next_strip = &strips[i + 1]; + let strip_width = (next_strip.col - strip.col) as usize; + let x1 = x0 + strip_width; + let xtile0 = (x0 / WIDE_TILE_WIDTH).max(clip_bbox[0]); + let xtile1 = x1.div_ceil(WIDE_TILE_WIDTH).min(clip_bbox[2]); + let mut x = x0; + let mut alpha_ix = strip.col as usize; + if clip_bbox[0] * WIDE_TILE_WIDTH > x { + alpha_ix += clip_bbox[0] * WIDE_TILE_WIDTH - x; + x = clip_bbox[0] * WIDE_TILE_WIDTH; + } + for xtile in xtile0..xtile1 { + if xtile > tile_x && core::mem::take(&mut pop_pending) { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + } + let x_tile_rel = (x % WIDE_TILE_WIDTH) as u32; + let width = x1.min((xtile + 1) * WIDE_TILE_WIDTH) - x; + let cmd = CmdClipStrip { + x: x_tile_rel, + width: width as u32, + alpha_ix, + }; + x += width; + alpha_ix += width; + self.tiles[tile_y * width_tiles + xtile].clip_strip(cmd); + tile_x = xtile; + pop_pending = true; + } + if next_strip.winding != 0 && y == next_strip.strip_y() as usize { + let x2 = next_strip.x() as usize; + let tile_x2 = x2.min((tile_x + 1) * WIDE_TILE_WIDTH); + let width = tile_x2 - x1; + if width > 0 { + let x_tile_rel = (x1 % WIDE_TILE_WIDTH) as u32; + self.tiles[tile_y * width_tiles + tile_x].clip_fill(x_tile_rel, width as u32); + } + if x2 > (tile_x + 1) * WIDE_TILE_WIDTH { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + let width2 = x2 % WIDE_TILE_WIDTH; + tile_x = x2 / WIDE_TILE_WIDTH; + if width2 > 0 { + self.tiles[tile_y * width_tiles + tile_x].clip_fill(0, width2 as u32); + } + } + } + } + if core::mem::take(&mut pop_pending) { + self.tiles[tile_y * width_tiles + tile_x].pop_clip(); + tile_x += 1; + } + while tile_y < clip_bbox[3] { + for x in tile_x..clip_bbox[2] { + self.tiles[tile_y * width_tiles + x].pop_zero_clip(); + } + tile_x = clip_bbox[0]; + tile_y += 1; + } + } + + fn pop_clips(&mut self) { + while self.state_stack.last().unwrap().n_clip > 0 { + self.pop_clip(); + } + } } impl RenderCtx for CsRenderCtx { @@ -200,15 +367,104 @@ impl RenderCtx for CsRenderCtx { } fn clip(&mut self, path: &vello_api::Path) { - todo!() + let affine = self.get_affine(); + crate::flatten::fill(&path.path, affine, &mut self.line_buf); + self.render_path_common(); + let strips = core::mem::take(&mut self.strip_buf); + let n_strips = strips.len(); + let path_bbox = if n_strips <= 1 { + [0, 0, 0, 0] + } else { + let y0 = strips[0].strip_y() as usize; + let y1 = strips[n_strips - 1].strip_y() as usize + 1; + let mut x0 = strips[0].x() as usize / WIDE_TILE_WIDTH; + let mut x1 = x0; + for i in 0..n_strips - 1 { + let strip = &strips[i]; + let next_strip = &strips[i + 1]; + let width = next_strip.col - strip.col; + let x = strip.x() as usize; + x0 = x0.min(x / WIDE_TILE_WIDTH); + x1 = x1.max((x + width as usize).div_ceil(WIDE_TILE_WIDTH)); + } + [x0, x1, y0, y1] + }; + let parent_bbox = self.get_bbox(); + // intersect clip bounding box + let clip_bbox = [ + parent_bbox[0].max(path_bbox[0]), + parent_bbox[1].max(path_bbox[1]), + parent_bbox[2].min(path_bbox[2]), + parent_bbox[3].min(path_bbox[3]), + ]; + // The next bit of code accomplishes the following. For each tile in + // the intersected bounding box, it does one of three things depending + // on the contents of the clip path in that tile. + // If all-zero: push a zero_clip + // If all-one: do nothing + // If contains one or more strips: push a clip + let mut tile_x = clip_bbox[0]; + let mut tile_y = clip_bbox[1]; + let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH); + for i in 0..n_strips - 1 { + let strip = &strips[i]; + let y = strip.strip_y() as usize; + if y < tile_y { + continue; + } + while tile_y < y.min(clip_bbox[3]) { + for x in tile_x..clip_bbox[2] { + self.tiles[tile_y * width_tiles + x].push_zero_clip(); + } + tile_x = clip_bbox[0]; + tile_y += 1; + } + if tile_y == clip_bbox[3] { + break; + } + let x_pixels = strip.x() as usize; + let x_clamped = (x_pixels / WIDE_TILE_WIDTH).min(clip_bbox[2]); + if tile_x < x_clamped { + if strip.winding == 0 { + for x in tile_x..x_clamped { + self.tiles[tile_y * width_tiles + x].push_zero_clip(); + } + } + // If winding is nonzero, then wide tiles covered entirely + // by sparse fill are no-op (no clipping is applied). + tile_x = x_clamped; + } + let next_strip = &strips[i + 1]; + let width = (next_strip.col - strip.col) as usize; + let x1 = (x_pixels + width) + .div_ceil(WIDE_TILE_WIDTH) + .min(clip_bbox[2]); + if tile_x < x1 { + for x in tile_x..x1 { + self.tiles[tile_y * width_tiles + x].push_clip(); + } + tile_x = x1; + } + } + while tile_y < clip_bbox[3] { + for x in tile_x..clip_bbox[2] { + self.tiles[tile_y * width_tiles + x].push_zero_clip(); + } + tile_x = clip_bbox[0]; + tile_y += 1; + } + let clip = Clip { clip_bbox, strips }; + self.clip_stack.push(clip); + self.state_stack.last_mut().unwrap().n_clip += 1; } fn save(&mut self) { - todo!() + self.state_stack.push(GfxState { n_clip: 0 }); } fn restore(&mut self) { - todo!() + self.pop_clips(); + self.state_stack.pop(); } fn transform(&mut self, affine: vello_api::peniko::kurbo::Affine) { diff --git a/vello_hybrid/src/simd/neon.rs b/vello_hybrid/src/simd/neon.rs index 72bf9202..777b8094 100644 --- a/vello_hybrid/src/simd/neon.rs +++ b/vello_hybrid/src/simd/neon.rs @@ -14,16 +14,18 @@ use crate::{ impl Fine<'_> { pub(crate) unsafe fn clear_simd(&mut self, color: [f32; 4]) { + let scratch = self.scratch.last_mut().unwrap(); unsafe { let v_color = vld1q_f32(color.as_ptr()); let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color); for i in 0..WIDE_TILE_WIDTH { - vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4); + vst1q_f32_x4(scratch.as_mut_ptr().add(i * 16), v_color_4); } } } pub(crate) fn pack_simd(&mut self, x: usize, y: usize) { + let scratch = self.scratch.last_mut().unwrap(); unsafe fn cvt(v: float32x4_t) -> uint8x16_t { unsafe { let clamped = vminq_f32(v, vdupq_n_f32(1.0)); @@ -40,14 +42,14 @@ impl Fine<'_> { let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4; for i in (0..WIDE_TILE_WIDTH).step_by(4) { let chunk_ix = base_ix + i * 4; - let v0 = vld1q_f32_x4(self.scratch.as_ptr().add(i * 16)); - let v1 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 1) * 16)); + let v0 = vld1q_f32_x4(scratch.as_ptr().add(i * 16)); + let v1 = vld1q_f32_x4(scratch.as_ptr().add((i + 1) * 16)); let x0 = cvt2(v0.0, v1.0); let x1 = cvt2(v0.1, v1.1); let x2 = cvt2(v0.2, v1.2); let x3 = cvt2(v0.3, v1.3); - let v2 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 2) * 16)); - let v3 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 3) * 16)); + let v2 = vld1q_f32_x4(scratch.as_ptr().add((i + 2) * 16)); + let v3 = vld1q_f32_x4(scratch.as_ptr().add((i + 3) * 16)); let x4 = cvt2(v2.0, v3.0); let y0 = vuzp1q_u8(x0, x4); vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix), y0); @@ -68,24 +70,25 @@ impl Fine<'_> { } pub(crate) unsafe fn fill_simd(&mut self, x: usize, width: usize, color: [f32; 4]) { + let scratch = self.scratch.last_mut().unwrap(); unsafe { let v_color = vld1q_f32(color.as_ptr()); let alpha = color[3]; if alpha == 1.0 { let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color); for i in x..x + width { - vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4); + vst1q_f32_x4(scratch.as_mut_ptr().add(i * 16), v_color_4); } } else { let one_minus_alpha = vdupq_n_f32(1.0 - alpha); for i in x..x + width { let ix = (x + i) * 16; - let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix)); + let mut v = vld1q_f32_x4(scratch.as_ptr().add(ix)); v.0 = vfmaq_f32(v_color, v.0, one_minus_alpha); v.1 = vfmaq_f32(v_color, v.1, one_minus_alpha); v.2 = vfmaq_f32(v_color, v.2, one_minus_alpha); v.3 = vfmaq_f32(v_color, v.3, one_minus_alpha); - vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v); + vst1q_f32_x4(scratch.as_mut_ptr().add(ix), v); } } } @@ -99,6 +102,7 @@ impl Fine<'_> { alphas: &[u32], color: [f32; 4], ) { + let scratch = self.scratch.last_mut().unwrap(); unsafe { debug_assert!(alphas.len() >= width, "overflow of alphas buffer"); let v_color = vmulq_f32(vld1q_f32(color.as_ptr()), vdupq_n_f32(1.0 / 255.0)); @@ -111,13 +115,13 @@ impl Fine<'_> { let a4 = vreinterpretq_u32_u16(vzip1q_u16(a3, vdupq_n_u16(0))); let alpha = vcvtq_f32_u32(a4); let ix = (x + i) * 16; - let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix)); + let mut v = vld1q_f32_x4(scratch.as_ptr().add(ix)); let one_minus_alpha = vfmsq_laneq_f32(vdupq_n_f32(1.0), alpha, v_color, 3); v.0 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 0), v.0, one_minus_alpha, 0); v.1 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 1), v.1, one_minus_alpha, 1); v.2 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 2), v.2, one_minus_alpha, 2); v.3 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 3), v.3, one_minus_alpha, 3); - vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v); + vst1q_f32_x4(scratch.as_mut_ptr().add(ix), v); } } } diff --git a/vello_hybrid/src/wide_tile.rs b/vello_hybrid/src/wide_tile.rs index 5e71153e..9d8dbcd6 100644 --- a/vello_hybrid/src/wide_tile.rs +++ b/vello_hybrid/src/wide_tile.rs @@ -6,15 +6,24 @@ use vello_api::peniko::color::{AlphaColor, Srgb}; pub(crate) const WIDE_TILE_WIDTH: usize = 256; pub(crate) const STRIP_HEIGHT: usize = 4; +#[derive(Debug)] pub(crate) struct WideTile { pub(crate) bg: AlphaColor, pub(crate) cmds: Vec, + n_zero_clip: usize, + n_clip: usize, } #[derive(Debug)] pub(crate) enum Cmd { Fill(CmdFill), Strip(CmdStrip), + /// Pushes a new transparent buffer to the clip stack. + PushClip, + /// Pops the clip stack. + PopClip, + ClipFill(CmdClipFill), + ClipStrip(CmdClipStrip), } #[derive(Debug)] @@ -33,26 +42,111 @@ pub(crate) struct CmdStrip { pub(crate) color: AlphaColor, } +/// Same as fill, but copies top of clip stack to next on stack. +#[derive(Debug)] +pub(crate) struct CmdClipFill { + pub(crate) x: u32, + pub(crate) width: u32, + // TODO: this should probably get at least an alpha for group opacity + // Also, this is where blend modes go. +} + +/// Same as strip, but composites top of clip stack to next on stack. +#[derive(Debug)] +pub(crate) struct CmdClipStrip { + pub(crate) x: u32, + pub(crate) width: u32, + pub(crate) alpha_ix: usize, + // See `CmdClipFill` for blending extension points +} + impl Default for WideTile { fn default() -> Self { Self { bg: AlphaColor::TRANSPARENT, cmds: vec![], + n_zero_clip: 0, + n_clip: 0, } } } impl WideTile { pub(crate) fn fill(&mut self, x: u32, width: u32, color: AlphaColor) { - if x == 0 && width == WIDE_TILE_WIDTH as u32 && color.components[3] == 1.0 { - self.cmds.clear(); - self.bg = color; - } else { - self.cmds.push(Cmd::Fill(CmdFill { x, width, color })); + if !self.is_zero_clip() { + // Note that we could be more aggressive in optimizing a whole-tile opaque fill + // even with a clip stack. It would be valid to elide all drawing commands from + // the enclosing clip push up to the fill. Further, we could extend the clip + // push command to include a background color, rather than always starting with + // a transparent buffer. Lastly, a sequence of push(bg); strip/fill; pop could + // be replaced with strip/fill with the color (the latter is true even with a + // non-opaque color). + // + // However, the extra cost of tracking such optimizations may outweigh the + // benefit, especially in hybrid mode with GPU painting. + if x == 0 + && width == WIDE_TILE_WIDTH as u32 + && color.components[3] == 1.0 + && self.n_clip == 0 + { + self.cmds.clear(); + self.bg = color; + } else { + self.cmds.push(Cmd::Fill(CmdFill { x, width, color })); + } + } + } + + pub(crate) fn strip(&mut self, cmd_strip: CmdStrip) { + if !self.is_zero_clip() { + self.cmds.push(Cmd::Strip(cmd_strip)); } } pub(crate) fn push(&mut self, cmd: Cmd) { self.cmds.push(cmd); } + + pub(crate) fn push_clip(&mut self) { + if !self.is_zero_clip() { + self.push(Cmd::PushClip); + self.n_clip += 1; + } + } + + pub(crate) fn pop_clip(&mut self) { + if !self.is_zero_clip() { + if matches!(self.cmds.last(), Some(Cmd::PushClip)) { + // Nothing was drawn inside the clip, elide it. + self.cmds.pop(); + } else { + self.push(Cmd::PopClip); + } + self.n_clip -= 1; + } + } + + pub(crate) fn push_zero_clip(&mut self) { + self.n_zero_clip += 1; + } + + pub(crate) fn pop_zero_clip(&mut self) { + self.n_zero_clip -= 1; + } + + pub(crate) fn is_zero_clip(&mut self) -> bool { + self.n_zero_clip > 0 + } + + pub(crate) fn clip_strip(&mut self, cmd_clip_strip: CmdClipStrip) { + if !self.is_zero_clip() && !matches!(self.cmds.last(), Some(Cmd::PushClip)) { + self.cmds.push(Cmd::ClipStrip(cmd_clip_strip)); + } + } + + pub(crate) fn clip_fill(&mut self, x: u32, width: u32) { + if !self.is_zero_clip() && !matches!(self.cmds.last(), Some(Cmd::PushClip)) { + self.cmds.push(Cmd::ClipFill(CmdClipFill { x, width })); + } + } }