From 8e11f278bf8117392a4c0e0801c727c37e4cc6d2 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Mon, 17 Feb 2025 18:40:37 -0800
Subject: [PATCH 1/5] Import cpu-sparse prototype

This brings in the cpu-sparse prototype from the piet-next branch of the piet repo. No substantive changes, but cpu-sparse is renamed vello_hybrid and piet-next is renamed vello_api.

Quite a bit of editing to satisfy the lint monster.

There was a half-written SIMD implementation of flattening, that's removed. It should be finished and re-added, as it's a good speedup.
---
 Cargo.lock                      |  60 +++++-
 Cargo.toml                      |   2 +
 vello_api/Cargo.toml            |  23 +++
 vello_api/README.md             |   5 +
 vello_api/src/any.rs            | 254 +++++++++++++++++++++++
 vello_api/src/generic_record.rs | 148 ++++++++++++++
 vello_api/src/lib.rs            | 148 ++++++++++++++
 vello_hybrid/Cargo.toml         |  28 +++
 vello_hybrid/README.md          |   5 +
 vello_hybrid/examples/simple.rs |  41 ++++
 vello_hybrid/examples/svg.rs    | 350 ++++++++++++++++++++++++++++++++
 vello_hybrid/src/fine.rs        | 124 +++++++++++
 vello_hybrid/src/flatten.rs     |  51 +++++
 vello_hybrid/src/lib.rs         |  20 ++
 vello_hybrid/src/pixmap.rs      |  39 ++++
 vello_hybrid/src/render.rs      | 282 +++++++++++++++++++++++++
 vello_hybrid/src/simd.rs        |  84 ++++++++
 vello_hybrid/src/simd/neon.rs   | 232 +++++++++++++++++++++
 vello_hybrid/src/strip.rs       | 209 +++++++++++++++++++
 vello_hybrid/src/tiling.rs      | 305 ++++++++++++++++++++++++++++
 vello_hybrid/src/wide_tile.rs   |  57 ++++++
 21 files changed, 2463 insertions(+), 4 deletions(-)
 create mode 100644 vello_api/Cargo.toml
 create mode 100644 vello_api/README.md
 create mode 100644 vello_api/src/any.rs
 create mode 100644 vello_api/src/generic_record.rs
 create mode 100644 vello_api/src/lib.rs
 create mode 100644 vello_hybrid/Cargo.toml
 create mode 100644 vello_hybrid/README.md
 create mode 100644 vello_hybrid/examples/simple.rs
 create mode 100644 vello_hybrid/examples/svg.rs
 create mode 100644 vello_hybrid/src/fine.rs
 create mode 100644 vello_hybrid/src/flatten.rs
 create mode 100644 vello_hybrid/src/lib.rs
 create mode 100644 vello_hybrid/src/pixmap.rs
 create mode 100644 vello_hybrid/src/render.rs
 create mode 100644 vello_hybrid/src/simd.rs
 create mode 100644 vello_hybrid/src/simd/neon.rs
 create mode 100644 vello_hybrid/src/strip.rs
 create mode 100644 vello_hybrid/src/tiling.rs
 create mode 100644 vello_hybrid/src/wide_tile.rs

diff --git a/Cargo.lock b/Cargo.lock
index c3471e9e..e1b746b7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -424,6 +424,14 @@ dependencies = [
  "unicode-width",
 ]
 
+[[package]]
+name = "color"
+version = "0.1.0"
+source = "git+https://github.com/linebender/color.git?rev=a4fa61aff6c3f292b729dc409e7832e5f0166e4a#a4fa61aff6c3f292b729dc409e7832e5f0166e4a"
+dependencies = [
+ "serde",
+]
+
 [[package]]
 name = "color"
 version = "0.2.3"
@@ -709,6 +717,17 @@ dependencies = [
  "miniz_oxide",
 ]
 
+[[package]]
+name = "flatten"
+version = "0.1.0"
+source = "git+https://github.com/linebender/gpu-stroke-expansion-paper?rev=827ccf6#827ccf6766179340a83f2de3417b7bdd8743706a"
+dependencies = [
+ "arrayvec",
+ "clap",
+ "kurbo",
+ "roxmltree 0.19.0",
+]
+
 [[package]]
 name = "foldhash"
 version = "0.1.4"
@@ -1693,13 +1712,23 @@ version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
 
+[[package]]
+name = "peniko"
+version = "0.2.0"
+source = "git+https://github.com/linebender/peniko?rev=aeded39#aeded39d0ea2fccf7db598c83b83748f3b88195f"
+dependencies = [
+ "color 0.1.0",
+ "kurbo",
+ "smallvec",
+]
+
 [[package]]
 name = "peniko"
 version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2ec061f435b573ff602b2c5690f1a5bfcf461327194654cfbfce4412b95cf2a1"
 dependencies = [
- "color",
+ "color 0.2.3",
  "kurbo",
  "smallvec",
 ]
@@ -1979,6 +2008,12 @@ version = "1.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "19b30a45b0cd0bcca8037f3d0dc3421eaf95327a17cad11964fb8179b4fc4832"
 
+[[package]]
+name = "roxmltree"
+version = "0.19.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3cd14fd5e3b777a7422cca79358c57a8f6e3a703d9ac187448d0daf220c2407f"
+
 [[package]]
 name = "roxmltree"
 version = "0.20.0"
@@ -2048,7 +2083,7 @@ dependencies = [
  "getrandom 0.3.1",
  "image",
  "rand",
- "roxmltree",
+ "roxmltree 0.20.0",
  "skrifa",
  "vello",
  "web-time",
@@ -2581,7 +2616,7 @@ dependencies = [
  "bytemuck",
  "futures-intrusive",
  "log",
- "peniko",
+ "peniko 0.3.1",
  "png",
  "skrifa",
  "static_assertions",
@@ -2592,17 +2627,34 @@ dependencies = [
  "wgpu-profiler",
 ]
 
+[[package]]
+name = "vello_api"
+version = "0.1.0"
+dependencies = [
+ "peniko 0.2.0",
+]
+
 [[package]]
 name = "vello_encoding"
 version = "0.4.0"
 dependencies = [
  "bytemuck",
  "guillotiere",
- "peniko",
+ "peniko 0.3.1",
  "skrifa",
  "smallvec",
 ]
 
+[[package]]
+name = "vello_hybrid"
+version = "0.1.0"
+dependencies = [
+ "flatten",
+ "png",
+ "roxmltree 0.20.0",
+ "vello_api",
+]
+
 [[package]]
 name = "vello_shaders"
 version = "0.4.0"
diff --git a/Cargo.toml b/Cargo.toml
index 6c958715..d460d13d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,9 @@
 resolver = "2"
 members = [
     "vello",
+    "vello_api",
     "vello_encoding",
+    "vello_hybrid",
     "vello_shaders",
     "vello_tests",
 
diff --git a/vello_api/Cargo.toml b/vello_api/Cargo.toml
new file mode 100644
index 00000000..ca133d96
--- /dev/null
+++ b/vello_api/Cargo.toml
@@ -0,0 +1,23 @@
+[package]
+name = "vello_api"
+version = "0.1.0"
+authors = ["Raph Levien <raph.levien@gmail.com>"]
+description = "A testbend for next-generation 2D renderer ideas"
+keywords = ["graphics", "2d"]
+categories = ["graphics"]
+edition.workspace = true
+rust-version.workspace = true
+license.workspace = true
+repository.workspace = true
+
+[package.metadata.docs.rs]
+all-features = true
+# There are no platform specific docs.
+default-target = "x86_64-unknown-linux-gnu"
+targets = []
+
+[lints]
+workspace = true
+
+[dependencies]
+peniko = { git = "https://github.com/linebender/peniko", rev = "aeded39" }
diff --git a/vello_api/README.md b/vello_api/README.md
new file mode 100644
index 00000000..6ff57647
--- /dev/null
+++ b/vello_api/README.md
@@ -0,0 +1,5 @@
+# vello_api
+
+Experiment in API abstraction for 2D rendering.
+
+TODO: explain
diff --git a/vello_api/src/any.rs b/vello_api/src/any.rs
new file mode 100644
index 00000000..1306297f
--- /dev/null
+++ b/vello_api/src/any.rs
@@ -0,0 +1,254 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#![allow(unused, reason = "prototyping")]
+
+use std::{any::Any, sync::Arc};
+
+use peniko::{kurbo::Affine, BrushRef};
+
+use crate::{Id, Path, Record, RenderCtx, ResourceCtx};
+
+#[derive(Clone)]
+pub struct AnyImage {
+    // TODO: move id into trait
+    id: Id,
+    body: Arc<dyn Any + Send + Sync>,
+}
+
+pub trait AnyRecord: Send {
+    fn as_any(&mut self) -> &mut dyn std::any::Any;
+
+    fn dyn_finish(&mut self) -> Arc<dyn Any + Send>;
+}
+
+impl<R: Record + Send + 'static> AnyRecord for R
+where
+    <<R as RenderCtx>::Resource as ResourceCtx>::Recording: Sync,
+{
+    fn as_any(&mut self) -> &mut dyn std::any::Any {
+        self
+    }
+
+    fn dyn_finish(&mut self) -> Arc<dyn Any + Send> {
+        let recording = self.finish();
+        Arc::new(recording)
+    }
+}
+
+pub trait AnyRenderCtx {
+    fn as_any(&mut self) -> &mut dyn std::any::Any;
+
+    fn dyn_playback(&mut self, recording: &Arc<dyn Any + Send>);
+
+    fn dyn_fill(&mut self, path: &Path, brush: BrushRef<'_>);
+}
+
+impl<RC: RenderCtx + 'static> AnyRenderCtx for RC {
+    fn as_any(&mut self) -> &mut dyn std::any::Any {
+        self
+    }
+
+    fn dyn_playback(&mut self, recording: &Arc<dyn Any + Send>) {
+        if let Some(recording) = recording.downcast_ref() {
+            self.playback(recording);
+        } else {
+            panic!("downcast error on playback");
+        }
+    }
+
+    fn dyn_fill(&mut self, path: &Path, brush: BrushRef<'_>) {
+        self.fill(path, brush);
+    }
+}
+
+pub type BoxedRenderCtx = Box<dyn AnyRenderCtx>;
+
+impl RenderCtx for BoxedRenderCtx {
+    type Resource = Box<dyn AnyResourceCtx>;
+
+    fn playback(&mut self, recording: &Arc<<Self::Resource as ResourceCtx>::Recording>) {
+        self.dyn_playback(recording);
+    }
+
+    fn fill(&mut self, path: &Path, brush: BrushRef<'_>) {
+        self.dyn_fill(path, brush);
+    }
+
+    fn stroke(&mut self, path: &Path, stroke: &peniko::kurbo::Stroke, brush: BrushRef<'_>) {
+        todo!()
+    }
+
+    fn draw_image(
+        &mut self,
+        image: &<Self::Resource as ResourceCtx>::Image,
+        dst_rect: peniko::kurbo::Rect,
+        interp: crate::InterpolationMode,
+    ) {
+        todo!()
+    }
+
+    fn clip(&mut self, path: &Path) {
+        todo!()
+    }
+
+    fn save(&mut self) {
+        todo!()
+    }
+
+    fn restore(&mut self) {
+        todo!()
+    }
+
+    fn transform(&mut self, affine: Affine) {
+        todo!()
+    }
+
+    fn begin_draw_glyphs(&mut self, font: &peniko::Font) {
+        todo!()
+    }
+
+    fn font_size(&mut self, size: f32) {
+        todo!()
+    }
+
+    fn hint(&mut self, hint: bool) {
+        todo!()
+    }
+
+    fn glyph_brush(&mut self, brush: BrushRef<'_>) {
+        todo!()
+    }
+
+    fn draw_glyphs(
+        &mut self,
+        style: peniko::StyleRef<'_>,
+        glyphs: &dyn Iterator<Item = crate::Glyph>,
+    ) {
+        todo!()
+    }
+
+    fn end_draw_glyphs(&mut self) {
+        todo!()
+    }
+}
+
+pub trait AnyResourceCtx {
+    fn as_any(&mut self) -> &mut dyn std::any::Any;
+
+    fn dyn_record(&mut self) -> Box<dyn AnyRecord + Send>;
+
+    fn dyn_make_image_with_stride(
+        &mut self,
+        width: usize,
+        height: usize,
+        stride: usize,
+        buf: &[u8],
+        format: crate::ImageFormat,
+    ) -> Result<AnyImage, crate::Error>;
+}
+
+impl ResourceCtx for Box<dyn AnyResourceCtx> {
+    type Image = AnyImage;
+
+    type Recording = dyn Any + Send;
+
+    type Record = Box<dyn AnyRecord>;
+
+    fn record(&mut self) -> Self::Record {
+        self.dyn_record()
+    }
+
+    fn make_image_with_stride(
+        &mut self,
+        width: usize,
+        height: usize,
+        stride: usize,
+        buf: &[u8],
+        format: crate::ImageFormat,
+    ) -> Result<Self::Image, crate::Error> {
+        let image = self.dyn_make_image_with_stride(width, height, stride, buf, format)?;
+        let id = Id::get();
+        Ok(AnyImage {
+            id,
+            body: Arc::new(image),
+        })
+    }
+}
+
+pub struct BoxedAnyRecord(Option<Box<dyn AnyRecord>>);
+
+impl RenderCtx for Box<dyn AnyRecord> {
+    type Resource = Box<dyn AnyResourceCtx>;
+
+    fn playback(&mut self, recording: &Arc<<Self::Resource as ResourceCtx>::Recording>) {
+        self.dyn_playback(recording);
+    }
+
+    fn fill(&mut self, path: &Path, brush: BrushRef<'_>) {
+        self.dyn_fill(path, brush);
+    }
+
+    fn stroke(&mut self, path: &Path, stroke: &peniko::kurbo::Stroke, brush: BrushRef<'_>) {
+        todo!()
+    }
+
+    fn draw_image(
+        &mut self,
+        image: &<Self::Resource as ResourceCtx>::Image,
+        dst_rect: peniko::kurbo::Rect,
+        interp: crate::InterpolationMode,
+    ) {
+        todo!()
+    }
+
+    fn clip(&mut self, path: &Path) {
+        todo!()
+    }
+
+    fn save(&mut self) {
+        todo!()
+    }
+
+    fn restore(&mut self) {
+        todo!()
+    }
+
+    fn transform(&mut self, affine: Affine) {
+        todo!()
+    }
+
+    fn begin_draw_glyphs(&mut self, font: &peniko::Font) {
+        todo!()
+    }
+
+    fn font_size(&mut self, size: f32) {
+        todo!()
+    }
+
+    fn hint(&mut self, hint: bool) {
+        todo!()
+    }
+
+    fn glyph_brush(&mut self, brush: BrushRef<'_>) {
+        todo!()
+    }
+
+    fn draw_glyphs(
+        &mut self,
+        style: peniko::StyleRef<'_>,
+        glyphs: &dyn Iterator<Item = crate::Glyph>,
+    ) {
+        todo!()
+    }
+
+    fn end_draw_glyphs(&mut self) {
+        todo!()
+    }
+}
+
+impl Record for Box<dyn AnyRecord> {
+    fn finish(&mut self) -> Arc<<Self::Resource as ResourceCtx>::Recording> {
+        self.dyn_finish()
+    }
+}
diff --git a/vello_api/src/generic_record.rs b/vello_api/src/generic_record.rs
new file mode 100644
index 00000000..cd991d5e
--- /dev/null
+++ b/vello_api/src/generic_record.rs
@@ -0,0 +1,148 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#![allow(unused, reason = "prototyping")]
+
+use std::sync::Arc;
+
+use peniko::{kurbo::Rect, Brush};
+
+use crate::{InterpolationMode, Path, Record, RenderCtx, ResourceCtx};
+
+pub struct GenericRecorder<RC: RenderCtx> {
+    cmds: Vec<Cmd<RC>>,
+}
+
+pub struct GenericResources<RC: RenderCtx> {
+    inner: RC::Resource,
+}
+
+enum Cmd<RC: RenderCtx> {
+    Fill(Path, Brush),
+    Image(
+        <RC::Resource as ResourceCtx>::Image,
+        Rect,
+        InterpolationMode,
+    ),
+}
+
+impl<RC: RenderCtx> GenericRecorder<RC> {
+    #[allow(
+        clippy::new_without_default,
+        reason = "didn't we get rid of this lint? it's so annoying"
+    )]
+    pub fn new() -> Self {
+        let cmds = Vec::new();
+        Self { cmds }
+    }
+
+    pub fn play(&self, ctx: &mut RC) {
+        for cmd in &self.cmds {
+            match cmd {
+                Cmd::Fill(path, brush) => ctx.fill(path, brush.into()),
+                Cmd::Image(image, rect, interp) => ctx.draw_image(image, *rect, *interp),
+            }
+        }
+    }
+}
+
+impl<RC: RenderCtx> RenderCtx for GenericRecorder<RC> {
+    type Resource = GenericResources<RC>;
+
+    fn playback(&mut self, recording: &std::sync::Arc<<Self::Resource as ResourceCtx>::Recording>) {
+        todo!()
+    }
+
+    fn fill(&mut self, path: &Path, brush: peniko::BrushRef<'_>) {
+        self.cmds.push(Cmd::Fill(path.clone(), brush.to_owned()));
+    }
+
+    fn stroke(&mut self, path: &Path, stroke: &peniko::kurbo::Stroke, brush: peniko::BrushRef<'_>) {
+        todo!()
+    }
+
+    fn draw_image(
+        &mut self,
+        image: &<Self::Resource as ResourceCtx>::Image,
+        dst_rect: peniko::kurbo::Rect,
+        interp: crate::InterpolationMode,
+    ) {
+        let image = image.clone();
+        self.cmds.push(Cmd::Image(image, dst_rect, interp));
+    }
+
+    fn clip(&mut self, path: &Path) {
+        todo!()
+    }
+
+    fn save(&mut self) {
+        todo!()
+    }
+
+    fn restore(&mut self) {
+        todo!()
+    }
+
+    fn transform(&mut self, affine: peniko::kurbo::Affine) {
+        todo!()
+    }
+
+    fn begin_draw_glyphs(&mut self, font: &peniko::Font) {
+        todo!()
+    }
+
+    fn font_size(&mut self, size: f32) {
+        todo!()
+    }
+
+    fn hint(&mut self, hint: bool) {
+        todo!()
+    }
+
+    fn glyph_brush(&mut self, brush: peniko::BrushRef<'_>) {
+        todo!()
+    }
+
+    fn draw_glyphs(
+        &mut self,
+        style: peniko::StyleRef<'_>,
+        glyphs: &dyn Iterator<Item = crate::Glyph>,
+    ) {
+        todo!()
+    }
+
+    fn end_draw_glyphs(&mut self) {
+        todo!()
+    }
+}
+
+impl<RC: RenderCtx> ResourceCtx for GenericResources<RC> {
+    type Image = <RC::Resource as ResourceCtx>::Image;
+
+    type Recording = GenericRecorder<RC>;
+
+    type Record = GenericRecorder<RC>;
+
+    fn record(&mut self) -> Self::Record {
+        GenericRecorder::new()
+    }
+
+    fn make_image_with_stride(
+        &mut self,
+        width: usize,
+        height: usize,
+        stride: usize,
+        buf: &[u8],
+        format: crate::ImageFormat,
+    ) -> Result<Self::Image, crate::Error> {
+        self.inner
+            .make_image_with_stride(width, height, stride, buf, format)
+    }
+}
+
+impl<RC: RenderCtx> Record for GenericRecorder<RC> {
+    fn finish(&mut self) -> Arc<<Self::Resource as ResourceCtx>::Recording> {
+        let cmds = std::mem::take(&mut self.cmds);
+        Arc::new(Self { cmds })
+    }
+}
diff --git a/vello_api/src/lib.rs b/vello_api/src/lib.rs
new file mode 100644
index 00000000..926cd7e9
--- /dev/null
+++ b/vello_api/src/lib.rs
@@ -0,0 +1,148 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#![allow(missing_docs, reason = "will add them later")]
+#![allow(missing_debug_implementations, reason = "prototyping")]
+#![allow(clippy::todo, reason = "still a prototype")]
+
+use std::{
+    num::NonZeroU64,
+    sync::{atomic::AtomicU64, Arc},
+};
+
+pub use peniko;
+
+use peniko::{
+    kurbo::{Affine, BezPath, Rect, Stroke},
+    BrushRef, Font, StyleRef,
+};
+
+mod any;
+mod generic_record;
+
+pub use any::{AnyImage, AnyRecord, AnyRenderCtx, AnyResourceCtx, BoxedAnyRecord, BoxedRenderCtx};
+pub use generic_record::{GenericRecorder, GenericResources};
+
+#[derive(Clone, Copy, PartialEq, Eq, Hash)]
+pub struct Id(NonZeroU64);
+
+// TODO: think this through
+pub type Error = Box<dyn std::error::Error>;
+
+#[derive(Clone)]
+pub struct Path {
+    pub id: Id,
+    pub path: BezPath,
+    // TODO: Vello encoding. kurbo BezPath can be used in interim
+    // Question: probably want to special-case rect, line, ellipse at least
+    // Probably also rounded-rect (incl varying corner radii)
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub enum ImageFormat {
+    Grayscale,
+    Rgb,
+    RgbaSeparate,
+    RgbaPremul,
+}
+
+#[derive(Clone, Copy, PartialEq, Eq, Hash, Debug)]
+pub enum InterpolationMode {
+    NearestNeighbor,
+    Bilinear,
+    // TODO: want to add cubic etc
+}
+
+/// Positioned glyph. This type matches Vello.
+pub struct Glyph {
+    pub id: u32,
+    pub x: f32,
+    pub y: f32,
+}
+
+pub trait RenderCtx {
+    type Resource: ResourceCtx;
+
+    fn playback(&mut self, recording: &Arc<<Self::Resource as ResourceCtx>::Recording>);
+
+    // should even-odd be an arg or another method?
+    fn fill(&mut self, path: &Path, brush: BrushRef<'_>);
+
+    fn stroke(&mut self, path: &Path, stroke: &Stroke, brush: BrushRef<'_>);
+
+    // TODO: clamp/extend/mirror
+    fn draw_image(
+        &mut self,
+        image: &<Self::Resource as ResourceCtx>::Image,
+        dst_rect: Rect,
+        interp: InterpolationMode,
+    );
+
+    fn clip(&mut self, path: &Path);
+
+    fn save(&mut self);
+
+    fn restore(&mut self);
+
+    fn transform(&mut self, affine: Affine);
+
+    /// Start a glyph drawing operation
+    ///
+    /// The glyph drawing operation ends with [`RenderCtx::end_draw_glyphs`]
+    fn begin_draw_glyphs(&mut self, font: &Font);
+
+    // Following methods are borrowed from Vello's DrawGlyph
+    fn font_size(&mut self, size: f32);
+
+    fn hint(&mut self, hint: bool);
+
+    fn glyph_brush(&mut self, brush: BrushRef<'_>);
+
+    fn draw_glyphs(&mut self, style: StyleRef<'_>, glyphs: &dyn Iterator<Item = Glyph>);
+
+    fn end_draw_glyphs(&mut self);
+}
+
+pub trait Record: RenderCtx {
+    // It should be possible to take self by move, but that triggers E0161
+    fn finish(&mut self) -> Arc<<Self::Resource as ResourceCtx>::Recording>;
+}
+
+pub trait ResourceCtx {
+    type Image: Clone + Send;
+
+    type Recording: Send + ?Sized;
+
+    type Record: Record + Send;
+
+    fn record(&mut self) -> Self::Record;
+
+    fn make_image_with_stride(
+        &mut self,
+        width: usize,
+        height: usize,
+        stride: usize,
+        buf: &[u8],
+        format: ImageFormat,
+    ) -> Result<Self::Image, Error>;
+}
+
+static ID_COUNTER: AtomicU64 = AtomicU64::new(0);
+
+impl Id {
+    pub fn get() -> Self {
+        let n = ID_COUNTER.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+        if let Some(x) = n.checked_add(1) {
+            Self(NonZeroU64::new(x).unwrap())
+        } else {
+            panic!("wow, overflow of u64, congratulations")
+        }
+    }
+}
+
+impl From<BezPath> for Path {
+    fn from(path: BezPath) -> Self {
+        let id = Id::get();
+        Self { id, path }
+    }
+}
diff --git a/vello_hybrid/Cargo.toml b/vello_hybrid/Cargo.toml
new file mode 100644
index 00000000..2656de48
--- /dev/null
+++ b/vello_hybrid/Cargo.toml
@@ -0,0 +1,28 @@
+[package]
+name = "vello_hybrid"
+version = "0.1.0"
+authors = ["Raph Levien <raph.levien@gmail.com>"]
+description = "An experimental CPU 2D renderer based on sparse strips"
+keywords = ["graphics", "2d"]
+categories = ["graphics"]
+edition.workspace = true
+rust-version.workspace = true
+license.workspace = true
+repository.workspace = true
+
+[package.metadata.docs.rs]
+all-features = true
+# There are no platform specific docs.
+default-target = "x86_64-unknown-linux-gnu"
+targets = []
+
+[lints]
+workspace = true
+
+[dependencies]
+vello_api = { path = "../vello_api" }
+flatten = { git = "https://github.com/linebender/gpu-stroke-expansion-paper", rev = "827ccf6" }
+
+[dev-dependencies]
+png = "0.17.14"
+roxmltree = "0.20.0"
diff --git a/vello_hybrid/README.md b/vello_hybrid/README.md
new file mode 100644
index 00000000..3764d234
--- /dev/null
+++ b/vello_hybrid/README.md
@@ -0,0 +1,5 @@
+# vello_hybrid
+
+Experiment in hybrid CPU/GPU rendering.
+
+TODO: explain
diff --git a/vello_hybrid/examples/simple.rs b/vello_hybrid/examples/simple.rs
new file mode 100644
index 00000000..e91f0175
--- /dev/null
+++ b/vello_hybrid/examples/simple.rs
@@ -0,0 +1,41 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#![allow(missing_docs, reason = "will add them later")]
+#![allow(clippy::cast_possible_truncation, reason = "we're doing it on purpose")]
+
+use std::io::BufWriter;
+
+use vello_api::peniko::color::palette;
+use vello_api::peniko::kurbo::{BezPath, Stroke};
+use vello_api::RenderCtx;
+use vello_hybrid::{CsRenderCtx, Pixmap};
+
+const WIDTH: usize = 1024;
+const HEIGHT: usize = 256;
+
+pub fn main() {
+    let mut ctx = CsRenderCtx::new(WIDTH, HEIGHT);
+    let mut path = BezPath::new();
+    path.move_to((10.0, 10.0));
+    path.line_to((180.0, 20.0));
+    path.line_to((30.0, 40.0));
+    path.close_path();
+    let piet_path = path.into();
+    ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into());
+    let stroke = Stroke::new(5.0);
+    ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into());
+    if let Some(filename) = std::env::args().nth(1) {
+        let mut pixmap = Pixmap::new(WIDTH, HEIGHT);
+        ctx.render_to_pixmap(&mut pixmap);
+        pixmap.unpremultiply();
+        let file = std::fs::File::create(filename).unwrap();
+        let w = BufWriter::new(file);
+        let mut encoder = png::Encoder::new(w, WIDTH as u32, HEIGHT as u32);
+        encoder.set_color(png::ColorType::Rgba);
+        let mut writer = encoder.write_header().unwrap();
+        writer.write_image_data(pixmap.data()).unwrap();
+    } else {
+        ctx.debug_dump();
+    }
+}
diff --git a/vello_hybrid/examples/svg.rs b/vello_hybrid/examples/svg.rs
new file mode 100644
index 00000000..85312b8c
--- /dev/null
+++ b/vello_hybrid/examples/svg.rs
@@ -0,0 +1,350 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#![allow(missing_docs, reason = "will add them later")]
+#![allow(missing_debug_implementations, reason = "prototyping")]
+#![allow(clippy::cast_possible_truncation, reason = "we're doing it on purpose")]
+
+//! SVG example for hybrid renderer
+
+use std::io::BufWriter;
+use std::str::FromStr;
+
+use roxmltree::{Document, Node};
+use vello_api::peniko::color::palette;
+use vello_api::peniko::kurbo::{Affine, BezPath, Point, Size, Stroke, Vec2};
+use vello_api::peniko::Color;
+use vello_api::RenderCtx;
+use vello_hybrid::{CsRenderCtx, Pixmap};
+
+const WIDTH: usize = 1024;
+const HEIGHT: usize = 1024;
+
+/// The main function of the example. The German word for main is "Haupt".
+pub fn main() {
+    let mut ctx = CsRenderCtx::new(WIDTH, HEIGHT);
+    let mut args = std::env::args().skip(1);
+    let svg_filename = args.next().expect("svg filename is first arg");
+    let out_filename = args.next().expect("png out filename is second arg");
+
+    let svg = std::fs::read_to_string(svg_filename).expect("error reading file");
+    let parsed = PicoSvg::load(&svg, 1.0).expect("error parsing SVG");
+    let mut pixmap = Pixmap::new(WIDTH, HEIGHT);
+    // Hacky code for crude measurements; change this to arg parsing
+    for i in 0..200 {
+        ctx.reset();
+        let start = std::time::Instant::now();
+        render_svg(&mut ctx, &parsed.items);
+        let coarse_time = start.elapsed();
+        ctx.render_to_pixmap(&mut pixmap);
+        if i % 100 == 0 {
+            println!(
+                "time to coarse: {coarse_time:?}, time to fine: {:?}",
+                start.elapsed()
+            );
+        }
+    }
+    pixmap.unpremultiply();
+    let file = std::fs::File::create(out_filename).unwrap();
+    let w = BufWriter::new(file);
+    let mut encoder = png::Encoder::new(w, WIDTH as u32, HEIGHT as u32);
+    encoder.set_color(png::ColorType::Rgba);
+    let mut writer = encoder.write_header().unwrap();
+    writer.write_image_data(pixmap.data()).unwrap();
+}
+
+fn render_svg(ctx: &mut impl RenderCtx, items: &[Item]) {
+    for item in items {
+        match item {
+            Item::Fill(fill_item) => ctx.fill(&fill_item.path, fill_item.color.into()),
+            Item::Stroke(stroke_item) => {
+                let style = Stroke::new(stroke_item.width);
+                ctx.stroke(&stroke_item.path, &style, stroke_item.color.into());
+            }
+            Item::Group(group_item) => {
+                // TODO: apply transform from group
+                render_svg(ctx, &group_item.children);
+            }
+        }
+    }
+}
+
+// Below is copied, lightly adapted, from Vello.
+
+pub struct PicoSvg {
+    pub items: Vec<Item>,
+    #[allow(unused, reason = "functionality NYI")]
+    pub size: Size,
+}
+
+pub enum Item {
+    Fill(FillItem),
+    Stroke(StrokeItem),
+    Group(GroupItem),
+}
+
+pub struct StrokeItem {
+    pub width: f64,
+    pub color: Color,
+    pub path: vello_api::Path,
+}
+
+pub struct FillItem {
+    pub color: Color,
+    pub path: vello_api::Path,
+}
+
+pub struct GroupItem {
+    #[allow(unused, reason = "functionality NYI")]
+    pub affine: Affine,
+    pub children: Vec<Item>,
+}
+
+struct Parser {
+    scale: f64,
+}
+
+impl PicoSvg {
+    pub fn load(xml_string: &str, scale: f64) -> Result<Self, Box<dyn std::error::Error>> {
+        let doc = Document::parse(xml_string)?;
+        let root = doc.root_element();
+        let mut parser = Parser::new(scale);
+        let width = root.attribute("width").and_then(|s| f64::from_str(s).ok());
+        let height = root.attribute("height").and_then(|s| f64::from_str(s).ok());
+        let (origin, viewbox_size) = root
+            .attribute("viewBox")
+            .and_then(|vb_attr| {
+                let vs: Vec<f64> = vb_attr
+                    .split(' ')
+                    .map(|s| f64::from_str(s).unwrap())
+                    .collect();
+                if let &[x, y, vb_width, vb_height] = vs.as_slice() {
+                    Some((
+                        Point { x, y },
+                        Size {
+                            width: vb_width,
+                            height: vb_height,
+                        },
+                    ))
+                } else {
+                    None
+                }
+            })
+            .unzip();
+
+        let mut transform = if let Some(origin) = origin {
+            Affine::translate(origin.to_vec2() * -1.0)
+        } else {
+            Affine::IDENTITY
+        };
+
+        transform *= match (width, height, viewbox_size) {
+            (None, None, Some(_)) => Affine::IDENTITY,
+            (Some(w), Some(h), Some(s)) => {
+                Affine::scale_non_uniform(1.0 / s.width * w, 1.0 / s.height * h)
+            }
+            (Some(w), None, Some(s)) => Affine::scale(1.0 / s.width * w),
+            (None, Some(h), Some(s)) => Affine::scale(1.0 / s.height * h),
+            _ => Affine::IDENTITY,
+        };
+
+        let size = match (width, height, viewbox_size) {
+            (None, None, Some(s)) => s,
+            (mw, mh, None) => Size {
+                width: mw.unwrap_or(300_f64),
+                height: mh.unwrap_or(150_f64),
+            },
+            (Some(w), None, Some(s)) => Size {
+                width: w,
+                height: 1.0 / w * s.width * s.height,
+            },
+            (None, Some(h), Some(s)) => Size {
+                width: 1.0 / h * s.height * s.width,
+                height: h,
+            },
+            (Some(width), Some(height), Some(_)) => Size { width, height },
+        };
+
+        transform *= if scale >= 0.0 {
+            Affine::scale(scale)
+        } else {
+            Affine::new([-scale, 0.0, 0.0, scale, 0.0, 0.0])
+        };
+        let props = RecursiveProperties {
+            fill: Some(Color::BLACK),
+        };
+        // The root element is the svg document element, which we don't care about
+        let mut items = Vec::new();
+        for node in root.children() {
+            parser.rec_parse(node, &props, &mut items)?;
+        }
+        let root_group = Item::Group(GroupItem {
+            affine: transform,
+            children: items,
+        });
+        Ok(Self {
+            items: vec![root_group],
+            size,
+        })
+    }
+}
+
+#[derive(Clone)]
+struct RecursiveProperties {
+    fill: Option<Color>,
+}
+
+impl Parser {
+    fn new(scale: f64) -> Self {
+        Self { scale }
+    }
+
+    fn rec_parse(
+        &mut self,
+        node: Node<'_, '_>,
+        properties: &RecursiveProperties,
+        items: &mut Vec<Item>,
+    ) -> Result<(), Box<dyn std::error::Error>> {
+        if node.is_element() {
+            let mut properties = properties.clone();
+            if let Some(fill_color) = node.attribute("fill") {
+                if fill_color == "none" {
+                    properties.fill = None;
+                } else {
+                    let color = parse_color(fill_color);
+                    let color = modify_opacity(color, "fill-opacity", node);
+                    // TODO: Handle recursive opacity properly
+                    let color = modify_opacity(color, "opacity", node);
+                    properties.fill = Some(color);
+                }
+            }
+            match node.tag_name().name() {
+                "g" => {
+                    let mut children = Vec::new();
+                    let mut affine = Affine::default();
+                    if let Some(transform) = node.attribute("transform") {
+                        affine = parse_transform(transform);
+                    }
+                    for child in node.children() {
+                        self.rec_parse(child, &properties, &mut children)?;
+                    }
+                    items.push(Item::Group(GroupItem { affine, children }));
+                }
+                "path" => {
+                    let d = node.attribute("d").ok_or("missing 'd' attribute")?;
+                    let bp = BezPath::from_svg(d)?;
+                    let path: vello_api::Path = bp.into();
+                    if let Some(color) = properties.fill {
+                        items.push(Item::Fill(FillItem {
+                            color,
+                            path: path.clone(),
+                        }));
+                    }
+                    if let Some(stroke_color) = node.attribute("stroke") {
+                        if stroke_color != "none" {
+                            let width = node
+                                .attribute("stroke-width")
+                                .map(|a| f64::from_str(a).unwrap_or(1.0))
+                                .unwrap_or(1.0)
+                                * self.scale.abs();
+                            let color = parse_color(stroke_color);
+                            let color = modify_opacity(color, "stroke-opacity", node);
+                            // TODO: Handle recursive opacity properly
+                            let color = modify_opacity(color, "opacity", node);
+                            items.push(Item::Stroke(StrokeItem { width, color, path }));
+                        }
+                    }
+                }
+                other => eprintln!("Unhandled node type {other}"),
+            }
+        }
+        Ok(())
+    }
+}
+
+fn parse_transform(transform: &str) -> Affine {
+    let mut nt = Affine::IDENTITY;
+    for ts in transform.split(')').map(str::trim) {
+        nt *= if let Some(s) = ts.strip_prefix("matrix(") {
+            let vals = s
+                .split([',', ' '])
+                .map(str::parse)
+                .collect::<Result<Vec<f64>, _>>()
+                .expect("Could parse all values of 'matrix' as floats");
+            Affine::new(
+                vals.try_into()
+                    .expect("Should be six arguments to `matrix`"),
+            )
+        } else if let Some(s) = ts.strip_prefix("translate(") {
+            if let Ok(vals) = s
+                .split([',', ' '])
+                .map(str::trim)
+                .map(str::parse)
+                .collect::<Result<Vec<f64>, _>>()
+            {
+                match vals.as_slice() {
+                    &[x, y] => Affine::translate(Vec2 { x, y }),
+                    _ => Affine::IDENTITY,
+                }
+            } else {
+                Affine::IDENTITY
+            }
+        } else if let Some(s) = ts.strip_prefix("scale(") {
+            if let Ok(vals) = s
+                .split([',', ' '])
+                .map(str::trim)
+                .map(str::parse)
+                .collect::<Result<Vec<f64>, _>>()
+            {
+                match *vals.as_slice() {
+                    [x, y] => Affine::scale_non_uniform(x, y),
+                    [x] => Affine::scale(x),
+                    _ => Affine::IDENTITY,
+                }
+            } else {
+                Affine::IDENTITY
+            }
+        } else if let Some(s) = ts.strip_prefix("scaleX(") {
+            s.trim()
+                .parse()
+                .ok()
+                .map(|x| Affine::scale_non_uniform(x, 1.0))
+                .unwrap_or(Affine::IDENTITY)
+        } else if let Some(s) = ts.strip_prefix("scaleY(") {
+            s.trim()
+                .parse()
+                .ok()
+                .map(|y| Affine::scale_non_uniform(1.0, y))
+                .unwrap_or(Affine::IDENTITY)
+        } else {
+            if !ts.is_empty() {
+                eprintln!("Did not understand transform attribute {ts:?})");
+            }
+            Affine::IDENTITY
+        };
+    }
+    nt
+}
+
+fn parse_color(color: &str) -> Color {
+    let color = color.trim();
+    if let Ok(c) = vello_api::peniko::color::parse_color(color) {
+        c.to_alpha_color()
+    } else {
+        palette::css::MAGENTA.with_alpha(0.5)
+    }
+}
+
+fn modify_opacity(color: Color, attr_name: &str, node: Node<'_, '_>) -> Color {
+    if let Some(opacity) = node.attribute(attr_name) {
+        let alpha: f64 = if let Some(o) = opacity.strip_suffix('%') {
+            let pctg = o.parse().unwrap_or(100.0);
+            pctg * 0.01
+        } else {
+            opacity.parse().unwrap_or(1.0)
+        };
+        color.with_alpha(alpha as f32)
+    } else {
+        color
+    }
+}
diff --git a/vello_hybrid/src/fine.rs b/vello_hybrid/src/fine.rs
new file mode 100644
index 00000000..4b935528
--- /dev/null
+++ b/vello_hybrid/src/fine.rs
@@ -0,0 +1,124 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Fine rasterization
+
+use crate::wide_tile::{Cmd, STRIP_HEIGHT, WIDE_TILE_WIDTH};
+
+const STRIP_HEIGHT_F32: usize = STRIP_HEIGHT * 4;
+
+pub(crate) struct Fine<'a> {
+    pub(crate) width: usize,
+    pub(crate) height: usize,
+    // rgba pixels
+    pub(crate) out_buf: &'a mut [u8],
+    // f32 RGBA pixels
+    // That said, if we use u8, then this is basically a block of
+    // untyped memory.
+    pub(crate) scratch: [f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4],
+    #[allow(clippy::doc_markdown, reason = "false positive for x86_64")]
+    /// Whether to use SIMD
+    ///
+    /// This is useful to toggle for performance evaluation reasons. It also
+    /// *must* be false if runtime detection fails, otherwise we have safety
+    /// problems. This is important for x86_64, as we'll be targeting Haswell
+    /// as the minimum.
+    #[allow(unused, reason = "some platforms might not have SIMD")]
+    // The allow(unused) lint exception is because some platforms may not have
+    // a SIMD implementation, and thus won't check the field.
+    pub(crate) use_simd: bool,
+}
+
+impl<'a> Fine<'a> {
+    pub(crate) fn new(width: usize, height: usize, out_buf: &'a mut [u8]) -> Self {
+        let scratch = [0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4];
+        Self {
+            width,
+            height,
+            out_buf,
+            scratch,
+            use_simd: true,
+        }
+    }
+
+    pub(crate) fn clear_scalar(&mut self, color: [f32; 4]) {
+        for z in self.scratch.chunks_exact_mut(4) {
+            z.copy_from_slice(&color);
+        }
+    }
+
+    pub(crate) fn pack_scalar(&mut self, x: usize, y: usize) {
+        // Note that these can trigger if the method is called on a pixmap that
+        // is not an integral multiple of the tile.
+        assert!(
+            (x + 1) * WIDE_TILE_WIDTH <= self.width,
+            "overflow of pixmap width"
+        );
+        assert!(
+            (y + 1) * STRIP_HEIGHT <= self.height,
+            "overflow of pixmap height"
+        );
+        let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4;
+        for j in 0..STRIP_HEIGHT {
+            let line_ix = base_ix + j * self.width * 4;
+            for i in 0..WIDE_TILE_WIDTH {
+                let mut rgba_f32 = [0.0; 4];
+                rgba_f32.copy_from_slice(&self.scratch[(i * STRIP_HEIGHT + j) * 4..][..4]);
+                let rgba_u8 = rgba_f32.map(|z| (z * 255.0).round() as u8);
+                self.out_buf[line_ix + i * 4..][..4].copy_from_slice(&rgba_u8);
+            }
+        }
+    }
+
+    pub(crate) fn run_cmd(&mut self, cmd: &Cmd, alphas: &[u32]) {
+        match cmd {
+            Cmd::Fill(f) => {
+                self.fill(f.x as usize, f.width as usize, f.color.components);
+            }
+            Cmd::Strip(s) => {
+                let aslice = &alphas[s.alpha_ix..];
+                self.strip(s.x as usize, s.width as usize, aslice, s.color.components);
+            }
+        }
+    }
+
+    pub(crate) fn fill_scalar(&mut self, x: usize, width: usize, color: [f32; 4]) {
+        if color[3] == 1.0 {
+            for z in
+                self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
+            {
+                z.copy_from_slice(&color);
+            }
+        } else {
+            let one_minus_alpha = 1.0 - color[3];
+            for z in
+                self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
+            {
+                for i in 0..4 {
+                    //z[i] = color[i] + one_minus_alpha * z[i];
+                    // Note: the mul_add will perform poorly on x86_64 default cpu target
+                    // Probably right thing to do is craft a #cfg that detects fma, fcma, etc.
+                    // What we really want is fmuladdf32 from intrinsics!
+                    z[i] = z[i].mul_add(one_minus_alpha, color[i]);
+                }
+            }
+        }
+    }
+
+    pub(crate) fn strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) {
+        debug_assert!(alphas.len() >= width, "overflow of alphas buffer");
+        let cs = color.map(|z| z * (1.0 / 255.0));
+        for (z, a) in self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width]
+            .chunks_exact_mut(16)
+            .zip(alphas)
+        {
+            for j in 0..4 {
+                let mask_alpha = ((*a >> (j * 8)) & 0xff) as f32;
+                let one_minus_alpha = 1.0 - mask_alpha * cs[3];
+                for i in 0..4 {
+                    z[j * 4 + i] = z[j * 4 + i].mul_add(one_minus_alpha, mask_alpha * cs[i]);
+                }
+            }
+        }
+    }
+}
diff --git a/vello_hybrid/src/flatten.rs b/vello_hybrid/src/flatten.rs
new file mode 100644
index 00000000..1d0bbc58
--- /dev/null
+++ b/vello_hybrid/src/flatten.rs
@@ -0,0 +1,51 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Utilities for flattening
+
+use flatten::stroke::LoweredPath;
+use vello_api::peniko::kurbo::{self, Affine, BezPath, Line, Point, Stroke};
+
+use crate::tiling::FlatLine;
+
+/// The flattening tolerance
+const TOL: f64 = 0.25;
+
+pub(crate) fn fill(path: &BezPath, affine: Affine, line_buf: &mut Vec<FlatLine>) {
+    line_buf.clear();
+    let mut start = Point::default();
+    let mut p0 = Point::default();
+    let iter = path.iter().map(|el| affine * el);
+    kurbo::flatten(iter, TOL, |el| match el {
+        kurbo::PathEl::MoveTo(p) => {
+            start = p;
+            p0 = p;
+        }
+        kurbo::PathEl::LineTo(p) => {
+            let pt0 = [p0.x as f32, p0.y as f32];
+            let pt1 = [p.x as f32, p.y as f32];
+            line_buf.push(FlatLine::new(pt0, pt1));
+            p0 = p;
+        }
+        kurbo::PathEl::QuadTo(_, _) => unreachable!(),
+        kurbo::PathEl::CurveTo(_, _, _) => unreachable!(),
+        kurbo::PathEl::ClosePath => {
+            let pt0 = [p0.x as f32, p0.y as f32];
+            let pt1 = [start.x as f32, start.y as f32];
+            if pt0 != pt1 {
+                line_buf.push(FlatLine::new(pt0, pt1));
+            }
+        }
+    });
+}
+
+pub(crate) fn stroke(path: &BezPath, style: &Stroke, affine: Affine, line_buf: &mut Vec<FlatLine>) {
+    line_buf.clear();
+    let iter = path.iter().map(|el| affine * el);
+    let lines: LoweredPath<Line> = flatten::stroke::stroke_undashed(iter, style, TOL);
+    for line in &lines.path {
+        let p0 = [line.p0.x as f32, line.p0.y as f32];
+        let p1 = [line.p1.x as f32, line.p1.y as f32];
+        line_buf.push(FlatLine::new(p0, p1));
+    }
+}
diff --git a/vello_hybrid/src/lib.rs b/vello_hybrid/src/lib.rs
new file mode 100644
index 00000000..341c6e90
--- /dev/null
+++ b/vello_hybrid/src/lib.rs
@@ -0,0 +1,20 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+#![allow(missing_docs, reason = "will add them later")]
+#![allow(missing_debug_implementations, reason = "prototyping")]
+#![allow(clippy::todo, reason = "still a prototype")]
+#![allow(clippy::cast_possible_truncation, reason = "we need to do this a lot")]
+
+mod fine;
+mod flatten;
+mod pixmap;
+mod render;
+mod simd;
+mod strip;
+mod tiling;
+mod wide_tile;
+
+pub use pixmap::Pixmap;
+pub use render::{CsRenderCtx, CsResourceCtx};
+pub use tiling::FlatLine;
diff --git a/vello_hybrid/src/pixmap.rs b/vello_hybrid/src/pixmap.rs
new file mode 100644
index 00000000..7a40bef6
--- /dev/null
+++ b/vello_hybrid/src/pixmap.rs
@@ -0,0 +1,39 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! A simple pixmap type
+
+pub struct Pixmap {
+    pub(crate) width: usize,
+    pub(crate) height: usize,
+    pub(crate) buf: Vec<u8>,
+}
+
+impl Pixmap {
+    pub fn new(width: usize, height: usize) -> Self {
+        let buf = vec![0; width * height * 4];
+        Self { width, height, buf }
+    }
+
+    pub fn data(&self) -> &[u8] {
+        &self.buf
+    }
+
+    pub fn data_mut(&mut self) -> &mut [u8] {
+        &mut self.buf
+    }
+
+    /// Convert from premultiplied to separate alpha.
+    ///
+    /// Not fast, but useful for saving to PNG etc.
+    pub fn unpremultiply(&mut self) {
+        for rgba in self.buf.chunks_exact_mut(4) {
+            let alpha = rgba[3] as f32 * (1.0 / 255.0);
+            if alpha != 0.0 {
+                rgba[0] = (rgba[0] as f32 / alpha).round().min(255.0) as u8;
+                rgba[1] = (rgba[1] as f32 / alpha).round().min(255.0) as u8;
+                rgba[2] = (rgba[2] as f32 / alpha).round().min(255.0) as u8;
+            }
+        }
+    }
+}
diff --git a/vello_hybrid/src/render.rs b/vello_hybrid/src/render.rs
new file mode 100644
index 00000000..8216ef5a
--- /dev/null
+++ b/vello_hybrid/src/render.rs
@@ -0,0 +1,282 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// Remove when all methods are implemented.
+#![allow(unused, reason = "lots of unused arguments from todo methods")]
+
+use std::collections::BTreeMap;
+
+use vello_api::{
+    peniko::{
+        color::{palette, AlphaColor, Srgb},
+        kurbo::Affine,
+        BrushRef,
+    },
+    GenericRecorder, RenderCtx, ResourceCtx,
+};
+
+use crate::{
+    fine::Fine,
+    strip::{self, Strip, Tile},
+    tiling::{self, FlatLine},
+    wide_tile::{Cmd, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH},
+    Pixmap,
+};
+
+pub struct CsRenderCtx {
+    width: usize,
+    height: usize,
+    tiles: Vec<WideTile>,
+    alphas: Vec<u32>,
+
+    /// These are all scratch buffers, to be used for path rendering. They're here solely
+    /// so the allocations can be reused.
+    line_buf: Vec<FlatLine>,
+    tile_buf: Vec<Tile>,
+    strip_buf: Vec<Strip>,
+}
+
+pub struct CsResourceCtx;
+
+impl CsRenderCtx {
+    pub fn new(width: usize, height: usize) -> Self {
+        let width_tiles = width.div_ceil(WIDE_TILE_WIDTH);
+        let height_tiles = height.div_ceil(STRIP_HEIGHT);
+        let tiles = (0..width_tiles * height_tiles)
+            .map(|_| WideTile::default())
+            .collect();
+        let alphas = vec![];
+        let line_buf = vec![];
+        let tile_buf = vec![];
+        let strip_buf = vec![];
+        Self {
+            width,
+            height,
+            tiles,
+            alphas,
+            line_buf,
+            tile_buf,
+            strip_buf,
+        }
+    }
+
+    pub fn reset(&mut self) {
+        for tile in &mut self.tiles {
+            tile.bg = AlphaColor::TRANSPARENT;
+            tile.cmds.clear();
+        }
+    }
+
+    pub fn render_to_pixmap(&self, pixmap: &mut Pixmap) {
+        let mut fine = Fine::new(pixmap.width, pixmap.height, &mut pixmap.buf);
+        let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH);
+        let height_tiles = (self.height).div_ceil(STRIP_HEIGHT);
+        for y in 0..height_tiles {
+            for x in 0..width_tiles {
+                let tile = &self.tiles[y * width_tiles + x];
+                fine.clear(tile.bg.components);
+                for cmd in &tile.cmds {
+                    fine.run_cmd(cmd, &self.alphas);
+                }
+                fine.pack(x, y);
+            }
+        }
+    }
+
+    pub fn tile_stats(&self) {
+        let mut histo = BTreeMap::new();
+        let mut total = 0;
+        for tile in &self.tiles {
+            let count = tile.cmds.len();
+            total += count;
+            *histo.entry(count).or_insert(0) += 1;
+        }
+        println!("total = {total}, {histo:?}");
+    }
+
+    /// Render a path, which has already been flattened into `line_buf`.
+    fn render_path(&mut self, brush: BrushRef<'_>) {
+        // TODO: need to make sure tiles contained in viewport - we'll likely
+        // panic otherwise.
+        tiling::make_tiles(&self.line_buf, &mut self.tile_buf);
+        self.tile_buf.sort_unstable_by(Tile::cmp);
+        crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas);
+        let color = brush_to_color(brush);
+        let width_tiles = self.width.div_ceil(WIDE_TILE_WIDTH);
+        for i in 0..self.strip_buf.len() - 1 {
+            let strip = &self.strip_buf[i];
+            let next_strip = &self.strip_buf[i + 1];
+            let x0 = strip.x();
+            let y = strip.strip_y();
+            let row_start = y as usize * width_tiles;
+            let strip_width = next_strip.col - strip.col;
+            let x1 = x0 + strip_width;
+            let xtile0 = x0 as usize / WIDE_TILE_WIDTH;
+            let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH);
+            let mut x = x0;
+            let mut col = strip.col;
+            for xtile in xtile0..xtile1 {
+                let x_tile_rel = x % WIDE_TILE_WIDTH as u32;
+                let width = x1.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x;
+                let cmd = CmdStrip {
+                    x: x_tile_rel,
+                    width,
+                    alpha_ix: col as usize,
+                    color,
+                };
+                x += width;
+                col += width;
+                self.tiles[row_start + xtile].push(Cmd::Strip(cmd));
+            }
+            if next_strip.winding != 0 && y == next_strip.strip_y() {
+                x = x1;
+                let x2 = next_strip.x();
+                let fxt0 = x1 as usize / WIDE_TILE_WIDTH;
+                let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH);
+                for xtile in fxt0..fxt1 {
+                    let x_tile_rel = x % WIDE_TILE_WIDTH as u32;
+                    let width = x2.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x;
+                    x += width;
+                    self.tiles[row_start + xtile].fill(x_tile_rel, width, color);
+                }
+            }
+        }
+    }
+
+    pub fn debug_dump(&self) {
+        let width_tiles = self.width.div_ceil(WIDE_TILE_WIDTH);
+        for (i, tile) in self.tiles.iter().enumerate() {
+            if !tile.cmds.is_empty() || tile.bg.components[3] != 0.0 {
+                let x = i % width_tiles;
+                let y = i / width_tiles;
+                println!("tile {x}, {y} bg {}", tile.bg.to_rgba8());
+                for cmd in &tile.cmds {
+                    println!("{cmd:?}");
+                }
+            }
+        }
+    }
+
+    fn get_affine(&self) -> Affine {
+        // TODO: get from graphics state
+        Affine::scale(5.0)
+    }
+}
+
+impl RenderCtx for CsRenderCtx {
+    type Resource = CsResourceCtx;
+
+    fn playback(
+        &mut self,
+        recording: &std::sync::Arc<<Self::Resource as vello_api::ResourceCtx>::Recording>,
+    ) {
+        recording.play(self);
+    }
+
+    fn fill(&mut self, path: &vello_api::Path, brush: BrushRef<'_>) {
+        let affine = self.get_affine();
+        crate::flatten::fill(&path.path, affine, &mut self.line_buf);
+        self.render_path(brush);
+    }
+
+    fn stroke(
+        &mut self,
+        path: &vello_api::Path,
+        stroke: &vello_api::peniko::kurbo::Stroke,
+        brush: BrushRef<'_>,
+    ) {
+        let affine = self.get_affine();
+        crate::flatten::stroke(&path.path, stroke, affine, &mut self.line_buf);
+        self.render_path(brush);
+    }
+
+    fn draw_image(
+        &mut self,
+        image: &<Self::Resource as vello_api::ResourceCtx>::Image,
+        dst_rect: vello_api::peniko::kurbo::Rect,
+        interp: vello_api::InterpolationMode,
+    ) {
+        todo!()
+    }
+
+    fn clip(&mut self, path: &vello_api::Path) {
+        todo!()
+    }
+
+    fn save(&mut self) {
+        todo!()
+    }
+
+    fn restore(&mut self) {
+        todo!()
+    }
+
+    fn transform(&mut self, affine: vello_api::peniko::kurbo::Affine) {
+        todo!()
+    }
+
+    fn begin_draw_glyphs(&mut self, font: &vello_api::peniko::Font) {
+        todo!()
+    }
+
+    fn font_size(&mut self, size: f32) {
+        todo!()
+    }
+
+    fn hint(&mut self, hint: bool) {
+        todo!()
+    }
+
+    fn glyph_brush(&mut self, brush: BrushRef<'_>) {
+        todo!()
+    }
+
+    fn draw_glyphs(
+        &mut self,
+        style: vello_api::peniko::StyleRef<'_>,
+        glyphs: &dyn Iterator<Item = vello_api::Glyph>,
+    ) {
+        todo!()
+    }
+
+    fn end_draw_glyphs(&mut self) {
+        todo!()
+    }
+}
+
+impl ResourceCtx for CsResourceCtx {
+    type Image = ();
+
+    type Recording = GenericRecorder<CsRenderCtx>;
+
+    type Record = GenericRecorder<CsRenderCtx>;
+
+    fn record(&mut self) -> Self::Record {
+        GenericRecorder::new()
+    }
+
+    fn make_image_with_stride(
+        &mut self,
+        width: usize,
+        height: usize,
+        stride: usize,
+        buf: &[u8],
+        format: vello_api::ImageFormat,
+    ) -> Result<Self::Image, vello_api::Error> {
+        todo!()
+    }
+}
+
+/// Get the color from the brush.
+///
+/// This is a hacky function that will go away when we implement
+/// other brushes. The general form is to match on whether it's a
+/// solid color. If not, then issue a cmd to render the brush into
+/// a brush buffer, then fill/strip as needed to composite into
+/// the main buffer.
+fn brush_to_color(brush: BrushRef<'_>) -> AlphaColor<Srgb> {
+    match brush {
+        BrushRef::Solid(c) => c,
+        _ => palette::css::MAGENTA,
+    }
+}
diff --git a/vello_hybrid/src/simd.rs b/vello_hybrid/src/simd.rs
new file mode 100644
index 00000000..c2a2614e
--- /dev/null
+++ b/vello_hybrid/src/simd.rs
@@ -0,0 +1,84 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! SIMD speedups
+
+use crate::{
+    fine::Fine,
+    strip::{Strip, Tile},
+};
+
+#[cfg(target_arch = "aarch64")]
+pub(crate) mod neon;
+
+// This block is when we have SIMD
+#[cfg(target_arch = "aarch64")]
+impl Fine<'_> {
+    pub(crate) fn pack(&mut self, x: usize, y: usize) {
+        if self.use_simd {
+            self.pack_simd(x, y);
+        } else {
+            self.pack_scalar(x, y);
+        }
+    }
+
+    pub(crate) fn clear(&mut self, color: [f32; 4]) {
+        if self.use_simd {
+            unsafe {
+                self.clear_simd(color);
+            }
+        } else {
+            self.clear_scalar(color);
+        }
+    }
+
+    pub(crate) fn fill(&mut self, x: usize, width: usize, color: [f32; 4]) {
+        if self.use_simd {
+            unsafe {
+                self.fill_simd(x, width, color);
+            }
+        } else {
+            self.fill_scalar(x, width, color);
+        }
+    }
+
+    pub(crate) fn strip(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) {
+        if self.use_simd {
+            unsafe {
+                self.strip_simd(x, width, alphas, color);
+            }
+        } else {
+            self.strip_scalar(x, width, alphas, color);
+        }
+    }
+}
+
+#[cfg(target_arch = "aarch64")]
+pub(crate) fn render_strips(tiles: &[Tile], strip_buf: &mut Vec<Strip>, alpha_buf: &mut Vec<u32>) {
+    neon::render_strips_simd(tiles, strip_buf, alpha_buf);
+}
+
+#[cfg(not(target_arch = "aarch64"))]
+pub fn render_strips(tiles: &[Tile], strip_buf: &mut Vec<Strip>, alpha_buf: &mut Vec<u32>) {
+    crate::strip::render_strips_scalar(tiles, strip_buf, alpha_buf);
+}
+
+// This block is the fallback, no SIMD
+#[cfg(not(target_arch = "aarch64"))]
+impl<'a> Fine<'a> {
+    pub(crate) fn pack(&mut self, x: usize, y: usize) {
+        self.pack_scalar(x, y);
+    }
+
+    pub(crate) fn clear(&mut self, color: [f32; 4]) {
+        self.clear_scalar(color);
+    }
+
+    pub(crate) fn fill(&mut self, x: usize, y: usize, color: [f32; 4]) {
+        self.fill_scalar(x, y, color);
+    }
+
+    pub(crate) fn strip(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) {
+        self.strip_scalar(x, width, alphas, color);
+    }
+}
diff --git a/vello_hybrid/src/simd/neon.rs b/vello_hybrid/src/simd/neon.rs
new file mode 100644
index 00000000..72bf9202
--- /dev/null
+++ b/vello_hybrid/src/simd/neon.rs
@@ -0,0 +1,232 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! SIMD speedups for Neon
+
+use core::arch::aarch64::*;
+
+use crate::{
+    fine::Fine,
+    strip::{Strip, Tile},
+    tiling::Vec2,
+    wide_tile::{STRIP_HEIGHT, WIDE_TILE_WIDTH},
+};
+
+impl Fine<'_> {
+    pub(crate) unsafe fn clear_simd(&mut self, color: [f32; 4]) {
+        unsafe {
+            let v_color = vld1q_f32(color.as_ptr());
+            let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color);
+            for i in 0..WIDE_TILE_WIDTH {
+                vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4);
+            }
+        }
+    }
+
+    pub(crate) fn pack_simd(&mut self, x: usize, y: usize) {
+        unsafe fn cvt(v: float32x4_t) -> uint8x16_t {
+            unsafe {
+                let clamped = vminq_f32(v, vdupq_n_f32(1.0));
+                let scaled = vmulq_f32(clamped, vdupq_n_f32(255.0));
+                vreinterpretq_u8_u32(vcvtnq_u32_f32(scaled))
+            }
+        }
+
+        unsafe fn cvt2(v0: float32x4_t, v1: float32x4_t) -> uint8x16_t {
+            unsafe { vuzp1q_u8(cvt(v0), cvt(v1)) }
+        }
+
+        unsafe {
+            let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4;
+            for i in (0..WIDE_TILE_WIDTH).step_by(4) {
+                let chunk_ix = base_ix + i * 4;
+                let v0 = vld1q_f32_x4(self.scratch.as_ptr().add(i * 16));
+                let v1 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 1) * 16));
+                let x0 = cvt2(v0.0, v1.0);
+                let x1 = cvt2(v0.1, v1.1);
+                let x2 = cvt2(v0.2, v1.2);
+                let x3 = cvt2(v0.3, v1.3);
+                let v2 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 2) * 16));
+                let v3 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 3) * 16));
+                let x4 = cvt2(v2.0, v3.0);
+                let y0 = vuzp1q_u8(x0, x4);
+                vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix), y0);
+                let x5 = cvt2(v2.1, v3.1);
+                let y1 = vuzp1q_u8(x1, x5);
+                vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix + self.width * 4), y1);
+                let x6 = cvt2(v2.2, v3.2);
+                let y2 = vuzp1q_u8(x2, x6);
+                vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix + self.width * 8), y2);
+                let x7 = cvt2(v2.3, v3.3);
+                let y3 = vuzp1q_u8(x3, x7);
+                vst1q_u8(
+                    self.out_buf.as_mut_ptr().add(chunk_ix + self.width * 12),
+                    y3,
+                );
+            }
+        }
+    }
+
+    pub(crate) unsafe fn fill_simd(&mut self, x: usize, width: usize, color: [f32; 4]) {
+        unsafe {
+            let v_color = vld1q_f32(color.as_ptr());
+            let alpha = color[3];
+            if alpha == 1.0 {
+                let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color);
+                for i in x..x + width {
+                    vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4);
+                }
+            } else {
+                let one_minus_alpha = vdupq_n_f32(1.0 - alpha);
+                for i in x..x + width {
+                    let ix = (x + i) * 16;
+                    let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix));
+                    v.0 = vfmaq_f32(v_color, v.0, one_minus_alpha);
+                    v.1 = vfmaq_f32(v_color, v.1, one_minus_alpha);
+                    v.2 = vfmaq_f32(v_color, v.2, one_minus_alpha);
+                    v.3 = vfmaq_f32(v_color, v.3, one_minus_alpha);
+                    vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v);
+                }
+            }
+        }
+    }
+
+    #[inline(never)]
+    pub(crate) unsafe fn strip_simd(
+        &mut self,
+        x: usize,
+        width: usize,
+        alphas: &[u32],
+        color: [f32; 4],
+    ) {
+        unsafe {
+            debug_assert!(alphas.len() >= width, "overflow of alphas buffer");
+            let v_color = vmulq_f32(vld1q_f32(color.as_ptr()), vdupq_n_f32(1.0 / 255.0));
+            for i in 0..width {
+                let a = *alphas.get_unchecked(i);
+                // all this zipping compiles to tbl, we should probably just write that
+                let a1 = vreinterpret_u8_u32(vdup_n_u32(a));
+                let a2 = vreinterpret_u16_u8(vzip1_u8(a1, vdup_n_u8(0)));
+                let a3 = vcombine_u16(a2, vdup_n_u16(0));
+                let a4 = vreinterpretq_u32_u16(vzip1q_u16(a3, vdupq_n_u16(0)));
+                let alpha = vcvtq_f32_u32(a4);
+                let ix = (x + i) * 16;
+                let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix));
+                let one_minus_alpha = vfmsq_laneq_f32(vdupq_n_f32(1.0), alpha, v_color, 3);
+                v.0 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 0), v.0, one_minus_alpha, 0);
+                v.1 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 1), v.1, one_minus_alpha, 1);
+                v.2 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 2), v.2, one_minus_alpha, 2);
+                v.3 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 3), v.3, one_minus_alpha, 3);
+                vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v);
+            }
+        }
+    }
+}
+
+#[inline(never)]
+pub(crate) fn render_strips_simd(
+    tiles: &[Tile],
+    strip_buf: &mut Vec<Strip>,
+    alpha_buf: &mut Vec<u32>,
+) {
+    unsafe {
+        strip_buf.clear();
+        let mut strip_start = true;
+        let mut cols = alpha_buf.len() as u32;
+        let mut prev_tile = &tiles[0];
+        let mut fp = prev_tile.footprint().0;
+        let mut seg_start = 0;
+        let mut delta = 0;
+        // Note: the input should contain a sentinel tile, to avoid having
+        // logic here to process the final strip.
+        const IOTA: [f32; 4] = [0.0, 1.0, 2.0, 3.0];
+        let iota = vld1q_f32(IOTA.as_ptr());
+        for i in 1..tiles.len() {
+            let tile = &tiles[i];
+            if prev_tile.loc() != tile.loc() {
+                let start_delta = delta;
+                let same_strip = prev_tile.loc().same_strip(&tile.loc());
+                if same_strip {
+                    fp |= 8;
+                }
+                let x0 = fp.trailing_zeros();
+                let x1 = 32 - fp.leading_zeros();
+                let mut areas = [[start_delta as f32; 4]; 4];
+                for this_tile in &tiles[seg_start..i] {
+                    // small gain possible here to unpack in simd, but llvm goes halfway
+                    delta += this_tile.delta();
+                    let p0 = Vec2::unpack(this_tile.p0);
+                    let p1 = Vec2::unpack(this_tile.p1);
+                    let slope = (p1.x - p0.x) / (p1.y - p0.y);
+                    let vstarty = vsubq_f32(vdupq_n_f32(p0.y), iota);
+                    let vy0 = vminq_f32(vmaxq_f32(vstarty, vdupq_n_f32(0.0)), vdupq_n_f32(1.0));
+                    let vy1a = vsubq_f32(vdupq_n_f32(p1.y), iota);
+                    let vy1 = vminq_f32(vmaxq_f32(vy1a, vdupq_n_f32(0.0)), vdupq_n_f32(1.0));
+                    let vdy = vsubq_f32(vy0, vy1);
+                    let mask = vceqzq_f32(vdy);
+                    let vslope = vbslq_f32(mask, vdupq_n_f32(0.0), vdupq_n_f32(slope));
+                    let vdy0 = vsubq_f32(vy0, vstarty);
+                    let vdy1 = vsubq_f32(vy1, vstarty);
+                    let mut vyedge = vdupq_n_f32(0.0);
+                    if p0.x == 0.0 {
+                        let ye = vsubq_f32(vdupq_n_f32(1.0), vstarty);
+                        vyedge = vminq_f32(vmaxq_f32(ye, vdupq_n_f32(0.0)), vdupq_n_f32(1.0));
+                    } else if p1.x == 0.0 {
+                        let ye = vsubq_f32(vy1a, vdupq_n_f32(1.0));
+                        vyedge = vminq_f32(vmaxq_f32(ye, vdupq_n_f32(-1.0)), vdupq_n_f32(0.0));
+                    }
+                    for x in x0..x1 {
+                        let mut varea = vld1q_f32(areas.as_ptr().add(x as usize) as *const f32);
+                        varea = vaddq_f32(varea, vyedge);
+                        let vstartx = vdupq_n_f32(p0.x - x as f32);
+                        let vxx0 = vfmaq_f32(vstartx, vdy0, vslope);
+                        let vxx1 = vfmaq_f32(vstartx, vdy1, vslope);
+                        let vxmin0 = vminq_f32(vxx0, vxx1);
+                        let vxmax = vmaxq_f32(vxx0, vxx1);
+                        let vxmin =
+                            vsubq_f32(vminq_f32(vxmin0, vdupq_n_f32(1.0)), vdupq_n_f32(1e-6));
+                        let vb = vminq_f32(vxmax, vdupq_n_f32(1.0));
+                        let vc = vmaxq_f32(vb, vdupq_n_f32(0.0));
+                        let vd = vmaxq_f32(vxmin, vdupq_n_f32(0.0));
+                        let vd2 = vmulq_f32(vd, vd);
+                        let vd2c2 = vfmsq_f32(vd2, vc, vc);
+                        let vax = vfmaq_f32(vb, vd2c2, vdupq_n_f32(0.5));
+                        let va = vdivq_f32(vsubq_f32(vax, vxmin), vsubq_f32(vxmax, vxmin));
+                        varea = vfmaq_f32(varea, va, vdy);
+                        vst1q_f32(areas.as_mut_ptr().add(x as usize) as *mut f32, varea);
+                    }
+                }
+                for x in x0..x1 {
+                    let mut alphas = 0_u32;
+                    let varea = vld1q_f32(areas.as_ptr().add(x as usize) as *const f32);
+                    let vnzw = vminq_f32(vabsq_f32(varea), vdupq_n_f32(1.0));
+                    let vscaled = vmulq_f32(vnzw, vdupq_n_f32(255.0));
+                    let vbits = vreinterpretq_u8_u32(vcvtnq_u32_f32(vscaled));
+                    let vbits2 = vuzp1q_u8(vbits, vbits);
+                    let vbits3 = vreinterpretq_u32_u8(vuzp1q_u8(vbits2, vbits2));
+                    vst1q_lane_u32::<0>(&mut alphas, vbits3);
+                    alpha_buf.push(alphas);
+                }
+
+                if strip_start {
+                    let xy = (1 << 18) * prev_tile.y as u32 + 4 * prev_tile.x as u32 + x0;
+                    let strip = Strip {
+                        xy,
+                        col: cols,
+                        winding: start_delta,
+                    };
+                    strip_buf.push(strip);
+                }
+                cols += x1 - x0;
+                fp = if same_strip { 1 } else { 0 };
+                strip_start = !same_strip;
+                seg_start = i;
+                if !prev_tile.loc().same_row(&tile.loc()) {
+                    delta = 0;
+                }
+            }
+            fp |= tile.footprint().0;
+            prev_tile = tile;
+        }
+    }
+}
diff --git a/vello_hybrid/src/strip.rs b/vello_hybrid/src/strip.rs
new file mode 100644
index 00000000..9f3b1e1e
--- /dev/null
+++ b/vello_hybrid/src/strip.rs
@@ -0,0 +1,209 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! CPU implementation of sparse strip rendering
+//!
+//! This is copied from the most recent GPU implementation, but has
+//! `path_id` stripped out, as on CPU we'll be doing one path at a time.
+//! That decision makes sense to some extent even when uploading to
+//! GPU, though some mechanism is required to tie the strips to paint.
+//!
+//! If there becomes a single, unified code base for this, then the
+//! `path_id` type should probably become a generic parameter.
+
+use crate::{tiling::Vec2, wide_tile::STRIP_HEIGHT};
+
+#[derive(Clone, Copy, PartialEq, Eq)]
+pub(crate) struct Loc {
+    x: u16,
+    y: u16,
+}
+
+pub(crate) struct Footprint(pub(crate) u32);
+
+pub(crate) struct Tile {
+    pub x: u16,
+    pub y: u16,
+    pub p0: u32, // packed
+    pub p1: u32, // packed
+}
+
+impl std::fmt::Debug for Tile {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let p0 = Vec2::unpack(self.p0);
+        let p1 = Vec2::unpack(self.p1);
+        write!(
+            f,
+            "Tile {{ xy: ({}, {}), p0: ({:.4}, {:.4}), p1: ({:.4}, {:.4}) }}",
+            self.x, self.y, p0.x, p0.y, p1.x, p1.y
+        )
+    }
+}
+
+#[repr(C)]
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct Strip {
+    pub xy: u32, // this could be u16's on the Rust side
+    pub col: u32,
+    pub winding: i32,
+}
+
+impl Loc {
+    pub(crate) fn same_strip(&self, other: &Self) -> bool {
+        self.same_row(other) && (other.x - self.x) / 2 == 0
+    }
+
+    pub(crate) fn same_row(&self, other: &Self) -> bool {
+        self.y == other.y
+    }
+}
+
+impl Tile {
+    #[allow(unused, reason = "only used for synthetic data")]
+    /// Create a tile from synthetic data.
+    fn new(loc: Loc, footprint: Footprint, delta: i32) -> Self {
+        let p0 = (delta == -1) as u32 * 65536 + footprint.0.trailing_zeros() * 8192;
+        let p1 = (delta == 1) as u32 * 65536 + (32 - footprint.0.leading_zeros()) * 8192;
+        Self {
+            x: loc.x,
+            y: loc.y,
+            p0,
+            p1,
+        }
+    }
+
+    pub(crate) fn loc(&self) -> Loc {
+        Loc {
+            x: self.x,
+            y: self.y,
+        }
+    }
+
+    pub(crate) fn footprint(&self) -> Footprint {
+        let x0 = (self.p0 & 0xffff) as f32 * (1.0 / 8192.0);
+        let x1 = (self.p1 & 0xffff) as f32 * (1.0 / 8192.0);
+        // On CPU, might be better to do this as fixed point
+        let xmin = x0.min(x1).floor() as u32;
+        let xmax = (xmin + 1).max(x0.max(x1).ceil() as u32);
+        Footprint((1 << xmax) - (1 << xmin))
+    }
+
+    pub(crate) fn delta(&self) -> i32 {
+        ((self.p1 >> 16) == 0) as i32 - ((self.p0 >> 16) == 0) as i32
+    }
+
+    // Comparison function for sorting. Only compares loc, doesn't care
+    // about points. Unpacking code has been validated to be efficient in
+    // Godbolt.
+    pub(crate) fn cmp(&self, b: &Self) -> std::cmp::Ordering {
+        let xya = ((self.y as u32) << 16) + (self.x as u32);
+        let xyb = ((b.y as u32) << 16) + (b.x as u32);
+        xya.cmp(&xyb)
+    }
+}
+
+// This can be unused when SIMD is selected. Probably a good idea to make it
+// selectable at runtime; will be needed for AVX2.
+#[allow(unused, reason = "may be unused when SIMD is selected")]
+pub(crate) fn render_strips_scalar(
+    tiles: &[Tile],
+    strip_buf: &mut Vec<Strip>,
+    alpha_buf: &mut Vec<u32>,
+) {
+    strip_buf.clear();
+    let mut strip_start = true;
+    let mut cols = alpha_buf.len() as u32;
+    let mut prev_tile = &tiles[0];
+    let mut fp = prev_tile.footprint().0;
+    let mut seg_start = 0;
+    let mut delta = 0;
+    // Note: the input should contain a sentinel tile, to avoid having
+    // logic here to process the final strip.
+    for i in 1..tiles.len() {
+        let tile = &tiles[i];
+        if prev_tile.loc() != tile.loc() {
+            let start_delta = delta;
+            let same_strip = prev_tile.loc().same_strip(&tile.loc());
+            if same_strip {
+                fp |= 8;
+            }
+            let x0 = fp.trailing_zeros();
+            let x1 = 32 - fp.leading_zeros();
+            let mut areas = [[start_delta as f32; 4]; 4];
+            for this_tile in &tiles[seg_start..i] {
+                delta += this_tile.delta();
+                let p0 = Vec2::unpack(this_tile.p0);
+                let p1 = Vec2::unpack(this_tile.p1);
+                let slope = (p1.x - p0.x) / (p1.y - p0.y);
+                for x in x0..x1 {
+                    let startx = p0.x - x as f32;
+                    for y in 0..4 {
+                        let starty = p0.y - y as f32;
+                        let y0 = starty.clamp(0.0, 1.0);
+                        let y1 = (p1.y - y as f32).clamp(0.0, 1.0);
+                        let dy = y0 - y1;
+                        // Note: getting rid of this predicate might help with
+                        // auto-vectorization. That said, just getting rid of
+                        // it causes artifacts (which may be divide by zero).
+                        if dy != 0.0 {
+                            let xx0 = startx + (y0 - starty) * slope;
+                            let xx1 = startx + (y1 - starty) * slope;
+                            let xmin0 = xx0.min(xx1);
+                            let xmax = xx0.max(xx1);
+                            let xmin = xmin0.min(1.0) - 1e-6;
+                            let b = xmax.min(1.0);
+                            let c = b.max(0.0);
+                            let d = xmin.max(0.0);
+                            let a = (b + 0.5 * (d * d - c * c) - xmin) / (xmax - xmin);
+                            areas[x as usize][y] += a * dy;
+                        }
+                        if p0.x == 0.0 {
+                            areas[x as usize][y] += (y as f32 - p0.y + 1.0).clamp(0.0, 1.0);
+                        } else if p1.x == 0.0 {
+                            areas[x as usize][y] -= (y as f32 - p1.y + 1.0).clamp(0.0, 1.0);
+                        }
+                    }
+                }
+            }
+            for x in x0..x1 {
+                let mut alphas = 0_u32;
+                for y in 0..4 {
+                    let area = areas[x as usize][y];
+                    // nonzero winding number rule
+                    let area_u8 = (area.abs().min(1.0) * 255.0).round() as u32;
+                    alphas += area_u8 << (y * 8);
+                }
+                alpha_buf.push(alphas);
+            }
+
+            if strip_start {
+                let xy = (1 << 18) * prev_tile.y as u32 + 4 * prev_tile.x as u32 + x0;
+                let strip = Strip {
+                    xy,
+                    col: cols,
+                    winding: start_delta,
+                };
+                strip_buf.push(strip);
+            }
+            cols += x1 - x0;
+            fp = if same_strip { 1 } else { 0 };
+            strip_start = !same_strip;
+            seg_start = i;
+            if !prev_tile.loc().same_row(&tile.loc()) {
+                delta = 0;
+            }
+        }
+        fp |= tile.footprint().0;
+        prev_tile = tile;
+    }
+}
+
+impl Strip {
+    pub(crate) fn x(&self) -> u32 {
+        self.xy & 0xffff
+    }
+
+    pub(crate) fn strip_y(&self) -> u32 {
+        self.xy / ((1 << 16) * STRIP_HEIGHT as u32)
+    }
+}
diff --git a/vello_hybrid/src/tiling.rs b/vello_hybrid/src/tiling.rs
new file mode 100644
index 00000000..dd6acdb2
--- /dev/null
+++ b/vello_hybrid/src/tiling.rs
@@ -0,0 +1,305 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use crate::strip::Tile;
+
+const TILE_WIDTH: u32 = 4;
+const TILE_HEIGHT: u32 = 4;
+
+const TILE_SCALE_X: f32 = 1.0 / TILE_WIDTH as f32;
+const TILE_SCALE_Y: f32 = 1.0 / TILE_HEIGHT as f32;
+
+/// This is just Line but f32
+#[derive(Clone, Copy, Debug)]
+#[repr(C)]
+pub struct FlatLine {
+    // should these be vec2?
+    pub p0: [f32; 2],
+    pub p1: [f32; 2],
+}
+
+impl FlatLine {
+    pub fn new(p0: [f32; 2], p1: [f32; 2]) -> Self {
+        Self { p0, p1 }
+    }
+}
+
+#[derive(Clone, Copy, Debug)]
+pub(crate) struct Vec2 {
+    pub x: f32,
+    pub y: f32,
+}
+
+const TILE_SCALE: f32 = 8192.0;
+// scale factor relative to unit square in tile
+const FRAC_TILE_SCALE: f32 = 8192.0 * 4.0;
+
+fn scale_up(z: f32) -> u32 {
+    (z * FRAC_TILE_SCALE).round() as u32
+}
+
+impl Vec2 {
+    fn new(x: f32, y: f32) -> Self {
+        Self { x, y }
+    }
+
+    fn from_array(xy: [f32; 2]) -> Self {
+        Self::new(xy[0], xy[1])
+    }
+
+    #[allow(unused, reason = "code might pack by hand")]
+    // Note: this assumes values in range.
+    fn pack(self) -> u32 {
+        // TODO: scale should depend on tile size
+        let x = (self.x * TILE_SCALE).round() as u32;
+        let y = (self.y * TILE_SCALE).round() as u32;
+        (y << 16) + x
+    }
+
+    pub(crate) fn unpack(packed: u32) -> Self {
+        let x = (packed & 0xffff) as f32 * (1.0 / TILE_SCALE);
+        let y = (packed >> 16) as f32 * (1.0 / TILE_SCALE);
+        Self::new(x, y)
+    }
+}
+
+impl std::ops::Add for Vec2 {
+    type Output = Self;
+
+    fn add(self, rhs: Self) -> Self {
+        Self::new(self.x + rhs.x, self.y + rhs.y)
+    }
+}
+
+impl std::ops::Sub for Vec2 {
+    type Output = Self;
+
+    fn sub(self, rhs: Self) -> Self {
+        Self::new(self.x - rhs.x, self.y - rhs.y)
+    }
+}
+
+impl std::ops::Mul<f32> for Vec2 {
+    type Output = Self;
+
+    fn mul(self, rhs: f32) -> Self {
+        Self::new(self.x * rhs, self.y * rhs)
+    }
+}
+
+fn span(a: f32, b: f32) -> u32 {
+    (a.max(b).ceil() - a.min(b).floor()).max(1.0) as u32
+}
+
+pub(crate) fn make_tiles(lines: &[FlatLine], tile_buf: &mut Vec<Tile>) {
+    tile_buf.clear();
+    for line in lines {
+        let p0 = Vec2::from_array(line.p0);
+        let p1 = Vec2::from_array(line.p1);
+        let s0 = p0 * TILE_SCALE_X;
+        let s1 = p1 * TILE_SCALE_Y;
+        let count_x = span(s0.x, s1.x);
+        let count_y = span(s0.y, s1.y);
+        let mut x = s0.x.floor();
+        if s0.x == x && s1.x < x {
+            // s0.x is on right side of first tile
+            x -= 1.0;
+        }
+        let mut y = s0.y.floor();
+        if s0.y == y && s1.y < y {
+            // s0.y is on bottom of first tile
+            y -= 1.0;
+        }
+        let xfrac0 = scale_up(s0.x - x);
+        let yfrac0 = scale_up(s0.y - y);
+        let packed0 = (yfrac0 << 16) + xfrac0;
+        // These could be replaced with <2 and the max(1.0) in span removed
+        if count_x == 1 {
+            let xfrac1 = scale_up(s1.x - x);
+            if count_y == 1 {
+                let yfrac1 = scale_up(s1.y - y);
+                let packed1 = (yfrac1 << 16) + xfrac1;
+                // 1x1 tile
+                tile_buf.push(Tile {
+                    x: x as u16,
+                    y: y as u16,
+                    p0: packed0,
+                    p1: packed1,
+                });
+            } else {
+                // vertical column
+                let slope = (s1.x - s0.x) / (s1.y - s0.y);
+                let sign = (s1.y - s0.y).signum();
+                let mut xclip0 = (s0.x - x) + (y - s0.y) * slope;
+                let yclip = if sign > 0.0 {
+                    xclip0 += slope;
+                    scale_up(1.0)
+                } else {
+                    0
+                };
+                let mut last_packed = packed0;
+                for i in 0..count_y - 1 {
+                    let xclip = xclip0 + i as f32 * sign * slope;
+                    let xfrac = scale_up(xclip).max(1);
+                    let packed = (yclip << 16) + xfrac;
+                    tile_buf.push(Tile {
+                        x: x as u16,
+                        y: (y + i as f32 * sign) as u16,
+                        p0: last_packed,
+                        p1: packed,
+                    });
+                    // flip y between top and bottom of tile
+                    last_packed = packed ^ ((FRAC_TILE_SCALE as u32) << 16);
+                }
+                let yfrac1 = scale_up(s1.y - (y + (count_y - 1) as f32 * sign));
+                let packed1 = (yfrac1 << 16) + xfrac1;
+
+                tile_buf.push(Tile {
+                    x: x as u16,
+                    y: (y + (count_y - 1) as f32 * sign) as u16,
+                    p0: last_packed,
+                    p1: packed1,
+                });
+            }
+        } else if count_y == 1 {
+            // horizontal row
+            let slope = (s1.y - s0.y) / (s1.x - s0.x);
+            let sign = (s1.x - s0.x).signum();
+            let mut yclip0 = (s0.y - y) + (x - s0.x) * slope;
+            let xclip = if sign > 0.0 {
+                yclip0 += slope;
+                scale_up(1.0)
+            } else {
+                0
+            };
+            let mut last_packed = packed0;
+            for i in 0..count_x - 1 {
+                let yclip = yclip0 + i as f32 * sign * slope;
+                let yfrac = scale_up(yclip).max(1);
+                let packed = (yfrac << 16) + xclip;
+                tile_buf.push(Tile {
+                    x: (x + i as f32 * sign) as u16,
+                    y: y as u16,
+                    p0: last_packed,
+                    p1: packed,
+                });
+                // flip x between left and right of tile
+                last_packed = packed ^ (FRAC_TILE_SCALE as u32);
+            }
+            let xfrac1 = scale_up(s1.x - (x + (count_x - 1) as f32 * sign));
+            let yfrac1 = scale_up(s1.y - y);
+            let packed1 = (yfrac1 << 16) + xfrac1;
+
+            tile_buf.push(Tile {
+                x: (x + (count_x - 1) as f32 * sign) as u16,
+                y: y as u16,
+                p0: last_packed,
+                p1: packed1,
+            });
+        } else {
+            // general case
+            let recip_dx = 1.0 / (s1.x - s0.x);
+            let signx = (s1.x - s0.x).signum();
+            let recip_dy = 1.0 / (s1.y - s0.y);
+            let signy = (s1.y - s0.y).signum();
+            // t parameter for next intersection with a vertical grid line
+            let mut t_clipx = (x - s0.x) * recip_dx;
+            let xclip = if signx > 0.0 {
+                t_clipx += recip_dx;
+                scale_up(1.0)
+            } else {
+                0
+            };
+            // t parameter for next intersection with a horizontal grid line
+            let mut t_clipy = (y - s0.y) * recip_dy;
+            let yclip = if signy > 0.0 {
+                t_clipy += recip_dy;
+                scale_up(1.0)
+            } else {
+                0
+            };
+            let x1 = x + (count_x - 1) as f32 * signx;
+            let y1 = y + (count_y - 1) as f32 * signy;
+            let mut xi = x;
+            let mut yi = y;
+            let mut last_packed = packed0;
+            let mut count = 0;
+            while xi != x1 || yi != y1 {
+                count += 1;
+                if count == 400 {
+                    panic!();
+                }
+                if t_clipy < t_clipx {
+                    // intersected with horizontal grid line
+                    let x_intersect = s0.x + (s1.x - s0.x) * t_clipy - xi;
+                    let xfrac = scale_up(x_intersect).max(1); // maybe should clamp?
+                    let packed = (yclip << 16) + xfrac;
+                    tile_buf.push(Tile {
+                        x: xi as u16,
+                        y: yi as u16,
+                        p0: last_packed,
+                        p1: packed,
+                    });
+                    t_clipy += recip_dy.abs();
+                    yi += signy;
+                    last_packed = packed ^ ((FRAC_TILE_SCALE as u32) << 16);
+                } else {
+                    // intersected with vertical grid line
+                    let y_intersect = s0.y + (s1.y - s0.y) * t_clipx - yi;
+                    let yfrac = scale_up(y_intersect).max(1); // maybe should clamp?
+                    let packed = (yfrac << 16) + xclip;
+                    tile_buf.push(Tile {
+                        x: xi as u16,
+                        y: yi as u16,
+                        p0: last_packed,
+                        p1: packed,
+                    });
+                    t_clipx += recip_dx.abs();
+                    xi += signx;
+                    last_packed = packed ^ (FRAC_TILE_SCALE as u32);
+                }
+            }
+            let xfrac1 = scale_up(s1.x - xi);
+            let yfrac1 = scale_up(s1.y - yi);
+            let packed1 = (yfrac1 << 16) + xfrac1;
+
+            tile_buf.push(Tile {
+                x: xi as u16,
+                y: yi as u16,
+                p0: last_packed,
+                p1: packed1,
+            });
+        }
+    }
+    // This particular choice of sentinel tiles generates a sentinel strip.
+    tile_buf.push(Tile {
+        x: 0x3ffd,
+        y: 0x3fff,
+        p0: 0,
+        p1: 0,
+    });
+    tile_buf.push(Tile {
+        x: 0x3fff,
+        y: 0x3fff,
+        p0: 0,
+        p1: 0,
+    });
+}
+
+#[test]
+fn tiling() {
+    let l = FlatLine {
+        p0: [1.3, 1.4],
+        p1: [20.1, 50.2],
+    };
+    let mut tiles = vec![];
+    make_tiles(&[l], &mut tiles);
+    for tile in &tiles {
+        let p0 = Vec2::unpack(tile.p0);
+        let p1 = Vec2::unpack(tile.p1);
+        println!(
+            "@{}, {}: ({}, {}) - ({}, {})",
+            tile.x, tile.y, p0.x, p0.y, p1.x, p1.y
+        );
+    }
+}
diff --git a/vello_hybrid/src/wide_tile.rs b/vello_hybrid/src/wide_tile.rs
new file mode 100644
index 00000000..fe5296a9
--- /dev/null
+++ b/vello_hybrid/src/wide_tile.rs
@@ -0,0 +1,57 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use vello_api::peniko::color::{AlphaColor, Srgb};
+
+pub(crate) const WIDE_TILE_WIDTH: usize = 256;
+pub(crate) const STRIP_HEIGHT: usize = 4;
+
+pub(crate) struct WideTile {
+    pub(crate) bg: AlphaColor<Srgb>,
+    pub(crate) cmds: Vec<Cmd>,
+}
+
+#[derive(Debug)]
+pub(crate) enum Cmd {
+    Fill(CmdFill),
+    Strip(CmdStrip),
+}
+
+#[derive(Debug)]
+pub(crate) struct CmdFill {
+    pub(crate) x: u32,
+    pub(crate) width: u32,
+    pub(crate) color: AlphaColor<Srgb>,
+}
+
+#[derive(Debug)]
+pub(crate) struct CmdStrip {
+    pub(crate) x: u32,
+    pub(crate) width: u32,
+    pub(crate) alpha_ix: usize,
+    pub(crate) color: AlphaColor<Srgb>,
+}
+
+impl Default for WideTile {
+    fn default() -> Self {
+        Self {
+            bg: AlphaColor::TRANSPARENT,
+            cmds: vec![],
+        }
+    }
+}
+
+impl WideTile {
+    pub(crate) fn fill(&mut self, x: u32, width: u32, color: AlphaColor<Srgb>) {
+        if x == 0 && width == WIDE_TILE_WIDTH as u32 && color.components[3] == 1.0 {
+            self.cmds.clear();
+            self.bg = color;
+        } else {
+            self.cmds.push(Cmd::Fill(CmdFill { x, width, color }));
+        }
+    }
+
+    pub(crate) fn push(&mut self, cmd: Cmd) {
+        self.cmds.push(cmd);
+    }
+}

From eb791309b892958a8bfe5db0e31fc0ffb490291b Mon Sep 17 00:00:00 2001
From: Raph Levien <raph@google.com>
Date: Mon, 17 Feb 2025 20:16:09 -0800
Subject: [PATCH 2/5] Fix lints in non-aarch64 cfg's

---
 vello_hybrid/src/simd.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vello_hybrid/src/simd.rs b/vello_hybrid/src/simd.rs
index c2a2614e..2213b558 100644
--- a/vello_hybrid/src/simd.rs
+++ b/vello_hybrid/src/simd.rs
@@ -59,13 +59,13 @@ pub(crate) fn render_strips(tiles: &[Tile], strip_buf: &mut Vec<Strip>, alpha_bu
 }
 
 #[cfg(not(target_arch = "aarch64"))]
-pub fn render_strips(tiles: &[Tile], strip_buf: &mut Vec<Strip>, alpha_buf: &mut Vec<u32>) {
+pub(crate) fn render_strips(tiles: &[Tile], strip_buf: &mut Vec<Strip>, alpha_buf: &mut Vec<u32>) {
     crate::strip::render_strips_scalar(tiles, strip_buf, alpha_buf);
 }
 
 // This block is the fallback, no SIMD
 #[cfg(not(target_arch = "aarch64"))]
-impl<'a> Fine<'a> {
+impl Fine<'_> {
     pub(crate) fn pack(&mut self, x: usize, y: usize) {
         self.pack_scalar(x, y);
     }

From 23a973d14b0b88a74192a0d0a54ccf45327c2506 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Tue, 18 Feb 2025 18:06:21 -0800
Subject: [PATCH 3/5] Start wiring up GPU render pipeline

Renders a simple scene to the GPU, first by doing coarse rasterization the same as cpu-sparse, then doing a single draw call.
---
 Cargo.lock                      | 471 ++++++++++++++++++++++++++------
 vello_api/Cargo.toml            |   2 +-
 vello_hybrid/Cargo.toml         |   9 +
 vello_hybrid/examples/gpu.rs    | 176 ++++++++++++
 vello_hybrid/shader/render.wgsl |  74 +++++
 vello_hybrid/src/lib.rs         |   5 +
 vello_hybrid/src/render.rs      |   8 +-
 vello_hybrid/src/wide_tile.rs   |   1 +
 8 files changed, 657 insertions(+), 89 deletions(-)
 create mode 100644 vello_hybrid/examples/gpu.rs
 create mode 100644 vello_hybrid/shader/render.wgsl

diff --git a/Cargo.lock b/Cargo.lock
index e1b746b7..61299d27 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -46,6 +46,27 @@ dependencies = [
  "memchr",
 ]
 
+[[package]]
+name = "android-activity"
+version = "0.5.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ee91c0c2905bae44f84bfa4e044536541df26b7703fd0888deeb9060fcc44289"
+dependencies = [
+ "android-properties",
+ "bitflags 2.8.0",
+ "cc",
+ "cesu8",
+ "jni",
+ "jni-sys",
+ "libc",
+ "log",
+ "ndk 0.8.0",
+ "ndk-context",
+ "ndk-sys 0.5.0+25.2.9519653",
+ "num_enum",
+ "thiserror 1.0.69",
+]
+
 [[package]]
 name = "android-activity"
 version = "0.6.0"
@@ -60,7 +81,7 @@ dependencies = [
  "jni-sys",
  "libc",
  "log",
- "ndk",
+ "ndk 0.9.0",
  "ndk-context",
  "ndk-sys 0.6.0+11769913",
  "num_enum",
@@ -191,6 +212,25 @@ dependencies = [
  "libloading",
 ]
 
+[[package]]
+name = "async-executor"
+version = "1.13.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "30ca9a001c1e8ba5149f91a74362376cc6bc5b919d92d988668657bd570bdcec"
+dependencies = [
+ "async-task",
+ "concurrent-queue",
+ "fastrand",
+ "futures-lite",
+ "slab",
+]
+
+[[package]]
+name = "async-task"
+version = "4.7.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8b75356056920673b02621b35afd0f7dda9306d03c79a30f5c56c44cf256e3de"
+
 [[package]]
 name = "atomic-waker"
 version = "1.1.2"
@@ -245,13 +285,32 @@ version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0d8c1fef690941d3e7788d328517591fecc684c084084702d6ff1641e993699a"
 
+[[package]]
+name = "block-sys"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae85a0696e7ea3b835a453750bf002770776609115e6d25c6d2ff28a8200f7e7"
+dependencies = [
+ "objc-sys",
+]
+
+[[package]]
+name = "block2"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "15b55663a85f33501257357e6421bb33e769d5c9ffb5ba0921c975a123e35e68"
+dependencies = [
+ "block-sys",
+ "objc2 0.4.1",
+]
+
 [[package]]
 name = "block2"
 version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2c132eebf10f5cad5289222520a4a058514204aed6d791f1cf4fe8088b82d15f"
 dependencies = [
- "objc2",
+ "objc2 0.5.2",
 ]
 
 [[package]]
@@ -298,6 +357,20 @@ version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b"
 
+[[package]]
+name = "calloop"
+version = "0.12.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fba7adb4dd5aa98e5553510223000e7148f621165ec5f9acd7113f6ca4995298"
+dependencies = [
+ "bitflags 2.8.0",
+ "log",
+ "polling",
+ "rustix",
+ "slab",
+ "thiserror 1.0.69",
+]
+
 [[package]]
 name = "calloop"
 version = "0.13.0"
@@ -312,13 +385,25 @@ dependencies = [
  "thiserror 1.0.69",
 ]
 
+[[package]]
+name = "calloop-wayland-source"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0f0ea9b9476c7fad82841a8dbb380e2eae480c21910feba80725b46931ed8f02"
+dependencies = [
+ "calloop 0.12.4",
+ "rustix",
+ "wayland-backend",
+ "wayland-client",
+]
+
 [[package]]
 name = "calloop-wayland-source"
 version = "0.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "95a66a987056935f7efce4ab5668920b5d0dac4a7c99991a67395f13702ddd20"
 dependencies = [
- "calloop",
+ "calloop 0.13.0",
  "rustix",
  "wayland-backend",
  "wayland-client",
@@ -359,6 +444,12 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "cfg_aliases"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "fd16c4719339c4530435d38e511904438d07cce7950afa3718a84ac36c10e89e"
+
 [[package]]
 name = "cfg_aliases"
 version = "0.2.1"
@@ -424,14 +515,6 @@ dependencies = [
  "unicode-width",
 ]
 
-[[package]]
-name = "color"
-version = "0.1.0"
-source = "git+https://github.com/linebender/color.git?rev=a4fa61aff6c3f292b729dc409e7832e5f0166e4a#a4fa61aff6c3f292b729dc409e7832e5f0166e4a"
-dependencies = [
- "serde",
-]
-
 [[package]]
 name = "color"
 version = "0.2.3"
@@ -796,6 +879,25 @@ dependencies = [
  "parking_lot",
 ]
 
+[[package]]
+name = "futures-io"
+version = "0.3.31"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9e5c1b78ca4aae1ac06c48a526a655760685149f0d465d21f37abfe57ce075c6"
+
+[[package]]
+name = "futures-lite"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f5edaec856126859abb19ed65f39e90fea3a9574b9707f13539acf4abf7eb532"
+dependencies = [
+ "fastrand",
+ "futures-core",
+ "futures-io",
+ "parking",
+ "pin-project-lite",
+]
+
 [[package]]
 name = "gethostname"
 version = "0.4.3"
@@ -969,7 +1071,7 @@ dependencies = [
  "env_logger",
  "futures-intrusive",
  "png",
- "pollster",
+ "pollster 0.4.0",
  "scenes",
  "vello",
 ]
@@ -998,6 +1100,17 @@ version = "2.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4"
 
+[[package]]
+name = "icrate"
+version = "0.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99d3aaff8a54577104bafdf686ff18565c3b6903ca5782a2026ef06e2c7aa319"
+dependencies = [
+ "block2 0.3.0",
+ "dispatch",
+ "objc2 0.4.1",
+]
+
 [[package]]
 name = "id-arena"
 version = "2.2.1"
@@ -1300,7 +1413,7 @@ dependencies = [
  "arrayvec",
  "bit-set",
  "bitflags 2.8.0",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "codespan-reporting",
  "hexf-parse",
  "indexmap 2.7.0",
@@ -1313,6 +1426,21 @@ dependencies = [
  "unicode-xid",
 ]
 
+[[package]]
+name = "ndk"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2076a31b7010b17a38c01907c45b945e8f11495ee4dd588309718901b1f7a5b7"
+dependencies = [
+ "bitflags 2.8.0",
+ "jni-sys",
+ "log",
+ "ndk-sys 0.5.0+25.2.9519653",
+ "num_enum",
+ "raw-window-handle",
+ "thiserror 1.0.69",
+]
+
 [[package]]
 name = "ndk"
 version = "0.9.0"
@@ -1453,6 +1581,16 @@ version = "0.3.5"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cdb91bdd390c7ce1a8607f35f3ca7151b65afc0ff5ff3b34fa350f7d7c7e4310"
 
+[[package]]
+name = "objc2"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "559c5a40fdd30eb5e344fbceacf7595a81e242529fb4e21cf5f43fb4f11ff98d"
+dependencies = [
+ "objc-sys",
+ "objc2-encode 3.0.0",
+]
+
 [[package]]
 name = "objc2"
 version = "0.5.2"
@@ -1460,7 +1598,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "46a785d4eeff09c14c487497c162e92766fbb3e4059a71840cecc03d9a50b804"
 dependencies = [
  "objc-sys",
- "objc2-encode",
+ "objc2-encode 4.0.3",
 ]
 
 [[package]]
@@ -1470,9 +1608,9 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e4e89ad9e3d7d297152b17d39ed92cd50ca8063a89a9fa569046d41568891eff"
 dependencies = [
  "bitflags 2.8.0",
- "block2",
+ "block2 0.5.1",
  "libc",
- "objc2",
+ "objc2 0.5.2",
  "objc2-core-data",
  "objc2-core-image",
  "objc2-foundation",
@@ -1486,8 +1624,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "74dd3b56391c7a0596a295029734d3c1c5e7e510a4cb30245f8221ccea96b009"
 dependencies = [
  "bitflags 2.8.0",
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-core-location",
  "objc2-foundation",
 ]
@@ -1498,8 +1636,8 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a5ff520e9c33812fd374d8deecef01d4a840e7b41862d849513de77e44aa4889"
 dependencies = [
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-foundation",
 ]
 
@@ -1510,8 +1648,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "617fbf49e071c178c0b24c080767db52958f716d9eabdf0890523aeae54773ef"
 dependencies = [
  "bitflags 2.8.0",
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-foundation",
 ]
 
@@ -1521,8 +1659,8 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "55260963a527c99f1819c4f8e3b47fe04f9650694ef348ffd2227e8196d34c80"
 dependencies = [
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-foundation",
  "objc2-metal",
 ]
@@ -1533,12 +1671,18 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "000cfee34e683244f284252ee206a27953279d370e309649dc3ee317b37e5781"
 dependencies = [
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-contacts",
  "objc2-foundation",
 ]
 
+[[package]]
+name = "objc2-encode"
+version = "3.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d079845b37af429bfe5dfa76e6d087d788031045b25cfc6fd898486fd9847666"
+
 [[package]]
 name = "objc2-encode"
 version = "4.0.3"
@@ -1552,10 +1696,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0ee638a5da3799329310ad4cfa62fbf045d5f56e3ef5ba4149e7452dcf89d5a8"
 dependencies = [
  "bitflags 2.8.0",
- "block2",
+ "block2 0.5.1",
  "dispatch",
  "libc",
- "objc2",
+ "objc2 0.5.2",
 ]
 
 [[package]]
@@ -1564,8 +1708,8 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "a1a1ae721c5e35be65f01a03b6d2ac13a54cb4fa70d8a5da293d7b0020261398"
 dependencies = [
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-app-kit",
  "objc2-foundation",
 ]
@@ -1577,8 +1721,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "dd0cba1276f6023976a406a14ffa85e1fdd19df6b0f737b063b95f6c8c7aadd6"
 dependencies = [
  "bitflags 2.8.0",
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-foundation",
 ]
 
@@ -1589,8 +1733,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e42bee7bff906b14b167da2bac5efe6b6a07e6f7c0a21a7308d40c960242dc7a"
 dependencies = [
  "bitflags 2.8.0",
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-foundation",
  "objc2-metal",
 ]
@@ -1601,7 +1745,7 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0a684efe3dec1b305badae1a28f6555f6ddd3bb2c2267896782858d5a78404dc"
 dependencies = [
- "objc2",
+ "objc2 0.5.2",
  "objc2-foundation",
 ]
 
@@ -1612,8 +1756,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b8bb46798b20cd6b91cbd113524c490f1686f4c4e8f49502431415f3512e2b6f"
 dependencies = [
  "bitflags 2.8.0",
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-cloud-kit",
  "objc2-core-data",
  "objc2-core-image",
@@ -1632,8 +1776,8 @@ version = "0.2.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "44fa5f9748dbfe1ca6c0b79ad20725a11eca7c2218bceb4b005cb1be26273bfe"
 dependencies = [
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-foundation",
 ]
 
@@ -1644,8 +1788,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "76cfcbf642358e8689af64cee815d139339f3ed8ad05103ed5eaf73db8d84cb3"
 dependencies = [
  "bitflags 2.8.0",
- "block2",
- "objc2",
+ "block2 0.5.1",
+ "objc2 0.5.2",
  "objc2-core-location",
  "objc2-foundation",
 ]
@@ -1683,6 +1827,12 @@ dependencies = [
  "ttf-parser",
 ]
 
+[[package]]
+name = "parking"
+version = "2.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba"
+
 [[package]]
 name = "parking_lot"
 version = "0.12.3"
@@ -1712,23 +1862,13 @@ version = "1.0.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a"
 
-[[package]]
-name = "peniko"
-version = "0.2.0"
-source = "git+https://github.com/linebender/peniko?rev=aeded39#aeded39d0ea2fccf7db598c83b83748f3b88195f"
-dependencies = [
- "color 0.1.0",
- "kurbo",
- "smallvec",
-]
-
 [[package]]
 name = "peniko"
 version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2ec061f435b573ff602b2c5690f1a5bfcf461327194654cfbfce4412b95cf2a1"
 dependencies = [
- "color 0.2.3",
+ "color",
  "kurbo",
  "smallvec",
 ]
@@ -1805,6 +1945,12 @@ dependencies = [
  "windows-sys 0.59.0",
 ]
 
+[[package]]
+name = "pollster"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "22686f4785f02a4fcc856d3b3bb19bf6c8160d103f7a99cc258bddd0251dc7f2"
+
 [[package]]
 name = "pollster"
 version = "0.4.0"
@@ -1955,6 +2101,15 @@ dependencies = [
  "font-types",
 ]
 
+[[package]]
+name = "redox_syscall"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "567664f262709473930a4bf9e51bf2ebf3348f2e748ccc50dea20646858f8f29"
+dependencies = [
+ "bitflags 1.3.2",
+]
+
 [[package]]
 name = "redox_syscall"
 version = "0.4.1"
@@ -2086,7 +2241,7 @@ dependencies = [
  "roxmltree 0.20.0",
  "skrifa",
  "vello",
- "web-time",
+ "web-time 1.1.0",
 ]
 
 [[package]]
@@ -2126,6 +2281,19 @@ version = "1.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
 
+[[package]]
+name = "sctk-adwaita"
+version = "0.8.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "70b31447ca297092c5a9916fc3b955203157b37c19ca8edde4f52e9843e602c7"
+dependencies = [
+ "ab_glyph",
+ "log",
+ "memmap2",
+ "smithay-client-toolkit 0.18.1",
+ "tiny-skia",
+]
+
 [[package]]
 name = "sctk-adwaita"
 version = "0.10.1"
@@ -2135,7 +2303,7 @@ dependencies = [
  "ab_glyph",
  "log",
  "memmap2",
- "smithay-client-toolkit",
+ "smithay-client-toolkit 0.19.2",
  "tiny-skia",
 ]
 
@@ -2239,16 +2407,16 @@ name = "simple"
 version = "0.0.0"
 dependencies = [
  "anyhow",
- "pollster",
+ "pollster 0.4.0",
  "vello",
- "winit",
+ "winit 0.30.8",
 ]
 
 [[package]]
 name = "simple_sdl2"
 version = "0.0.0"
 dependencies = [
- "pollster",
+ "pollster 0.4.0",
  "sdl2",
  "vello",
 ]
@@ -2287,6 +2455,31 @@ version = "1.13.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
 
+[[package]]
+name = "smithay-client-toolkit"
+version = "0.18.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "922fd3eeab3bd820d76537ce8f582b1cf951eceb5475c28500c7457d9d17f53a"
+dependencies = [
+ "bitflags 2.8.0",
+ "calloop 0.12.4",
+ "calloop-wayland-source 0.2.0",
+ "cursor-icon",
+ "libc",
+ "log",
+ "memmap2",
+ "rustix",
+ "thiserror 1.0.69",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-csd-frame",
+ "wayland-cursor",
+ "wayland-protocols 0.31.2",
+ "wayland-protocols-wlr 0.2.0",
+ "wayland-scanner",
+ "xkeysym",
+]
+
 [[package]]
 name = "smithay-client-toolkit"
 version = "0.19.2"
@@ -2294,8 +2487,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "3457dea1f0eb631b4034d61d4d8c32074caa6cd1ab2d59f2327bd8461e2c0016"
 dependencies = [
  "bitflags 2.8.0",
- "calloop",
- "calloop-wayland-source",
+ "calloop 0.13.0",
+ "calloop-wayland-source 0.3.0",
  "cursor-icon",
  "libc",
  "log",
@@ -2306,8 +2499,8 @@ dependencies = [
  "wayland-client",
  "wayland-csd-frame",
  "wayland-cursor",
- "wayland-protocols",
- "wayland-protocols-wlr",
+ "wayland-protocols 0.32.5",
+ "wayland-protocols-wlr 0.3.5",
  "wayland-scanner",
  "xkeysym",
 ]
@@ -2616,7 +2809,7 @@ dependencies = [
  "bytemuck",
  "futures-intrusive",
  "log",
- "peniko 0.3.1",
+ "peniko",
  "png",
  "skrifa",
  "static_assertions",
@@ -2631,7 +2824,7 @@ dependencies = [
 name = "vello_api"
 version = "0.1.0"
 dependencies = [
- "peniko 0.2.0",
+ "peniko",
 ]
 
 [[package]]
@@ -2640,7 +2833,7 @@ version = "0.4.0"
 dependencies = [
  "bytemuck",
  "guillotiere",
- "peniko 0.3.1",
+ "peniko",
  "skrifa",
  "smallvec",
 ]
@@ -2649,10 +2842,15 @@ dependencies = [
 name = "vello_hybrid"
 version = "0.1.0"
 dependencies = [
+ "async-executor",
+ "bytemuck",
  "flatten",
  "png",
+ "pollster 0.3.0",
  "roxmltree 0.20.0",
  "vello_api",
+ "wgpu",
+ "winit 0.29.15",
 ]
 
 [[package]]
@@ -2674,7 +2872,7 @@ dependencies = [
  "image",
  "nv-flip",
  "png",
- "pollster",
+ "pollster 0.4.0",
  "scenes",
  "vello",
 ]
@@ -2966,6 +3164,18 @@ dependencies = [
  "xcursor",
 ]
 
+[[package]]
+name = "wayland-protocols"
+version = "0.31.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8f81f365b8b4a97f422ac0e8737c438024b5951734506b0e1d775c73030561f4"
+dependencies = [
+ "bitflags 2.8.0",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-scanner",
+]
+
 [[package]]
 name = "wayland-protocols"
 version = "0.32.5"
@@ -2978,6 +3188,19 @@ dependencies = [
  "wayland-scanner",
 ]
 
+[[package]]
+name = "wayland-protocols-plasma"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "23803551115ff9ea9bce586860c5c5a971e360825a0309264102a9495a5ff479"
+dependencies = [
+ "bitflags 2.8.0",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-protocols 0.31.2",
+ "wayland-scanner",
+]
+
 [[package]]
 name = "wayland-protocols-plasma"
 version = "0.3.5"
@@ -2987,7 +3210,20 @@ dependencies = [
  "bitflags 2.8.0",
  "wayland-backend",
  "wayland-client",
- "wayland-protocols",
+ "wayland-protocols 0.32.5",
+ "wayland-scanner",
+]
+
+[[package]]
+name = "wayland-protocols-wlr"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad1f61b76b6c2d8742e10f9ba5c3737f6530b4c243132c2a2ccc8aa96fe25cd6"
+dependencies = [
+ "bitflags 2.8.0",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-protocols 0.31.2",
  "wayland-scanner",
 ]
 
@@ -3000,7 +3236,7 @@ dependencies = [
  "bitflags 2.8.0",
  "wayland-backend",
  "wayland-client",
- "wayland-protocols",
+ "wayland-protocols 0.32.5",
  "wayland-scanner",
 ]
 
@@ -3037,6 +3273,16 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "web-time"
+version = "0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "aa30049b1c872b72c89866d458eae9f20380ab280ffd1b1e18df2d3e2d98cfe0"
+dependencies = [
+ "js-sys",
+ "wasm-bindgen",
+]
+
 [[package]]
 name = "web-time"
 version = "1.1.0"
@@ -3055,7 +3301,7 @@ checksum = "47f55718f85c2fa756edffa0e7f0e0a60aba463d1362b57e23123c58f035e4b6"
 dependencies = [
  "arrayvec",
  "bitflags 2.8.0",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "document-features",
  "js-sys",
  "log",
@@ -3082,7 +3328,7 @@ dependencies = [
  "arrayvec",
  "bit-vec",
  "bitflags 2.8.0",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "document-features",
  "indexmap 2.7.0",
  "log",
@@ -3111,7 +3357,7 @@ dependencies = [
  "bitflags 2.8.0",
  "block",
  "bytemuck",
- "cfg_aliases",
+ "cfg_aliases 0.2.1",
  "core-graphics-types",
  "glow",
  "glutin_wgl_sys",
@@ -3249,6 +3495,15 @@ dependencies = [
  "windows-targets 0.42.2",
 ]
 
+[[package]]
+name = "windows-sys"
+version = "0.48.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9"
+dependencies = [
+ "windows-targets 0.48.5",
+]
+
 [[package]]
 name = "windows-sys"
 version = "0.52.0"
@@ -3445,6 +3700,54 @@ version = "0.52.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
 
+[[package]]
+name = "winit"
+version = "0.29.15"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0d59ad965a635657faf09c8f062badd885748428933dad8e8bdd64064d92e5ca"
+dependencies = [
+ "ahash",
+ "android-activity 0.5.2",
+ "atomic-waker",
+ "bitflags 2.8.0",
+ "bytemuck",
+ "calloop 0.12.4",
+ "cfg_aliases 0.1.1",
+ "core-foundation",
+ "core-graphics",
+ "cursor-icon",
+ "icrate",
+ "js-sys",
+ "libc",
+ "log",
+ "memmap2",
+ "ndk 0.8.0",
+ "ndk-sys 0.5.0+25.2.9519653",
+ "objc2 0.4.1",
+ "once_cell",
+ "orbclient",
+ "percent-encoding",
+ "raw-window-handle",
+ "redox_syscall 0.3.5",
+ "rustix",
+ "sctk-adwaita 0.8.3",
+ "smithay-client-toolkit 0.18.1",
+ "smol_str",
+ "unicode-segmentation",
+ "wasm-bindgen",
+ "wasm-bindgen-futures",
+ "wayland-backend",
+ "wayland-client",
+ "wayland-protocols 0.31.2",
+ "wayland-protocols-plasma 0.2.0",
+ "web-sys",
+ "web-time 0.2.4",
+ "windows-sys 0.48.0",
+ "x11-dl",
+ "x11rb",
+ "xkbcommon-dl",
+]
+
 [[package]]
 name = "winit"
 version = "0.30.8"
@@ -3452,13 +3755,13 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f5d74280aabb958072864bff6cfbcf9025cf8bfacdde5e32b5e12920ef703b0f"
 dependencies = [
  "ahash",
- "android-activity",
+ "android-activity 0.6.0",
  "atomic-waker",
  "bitflags 2.8.0",
- "block2",
+ "block2 0.5.1",
  "bytemuck",
- "calloop",
- "cfg_aliases",
+ "calloop 0.13.0",
+ "cfg_aliases 0.2.1",
  "concurrent-queue",
  "core-foundation",
  "core-graphics",
@@ -3467,8 +3770,8 @@ dependencies = [
  "js-sys",
  "libc",
  "memmap2",
- "ndk",
- "objc2",
+ "ndk 0.9.0",
+ "objc2 0.5.2",
  "objc2-app-kit",
  "objc2-foundation",
  "objc2-ui-kit",
@@ -3478,8 +3781,8 @@ dependencies = [
  "raw-window-handle",
  "redox_syscall 0.4.1",
  "rustix",
- "sctk-adwaita",
- "smithay-client-toolkit",
+ "sctk-adwaita 0.10.1",
+ "smithay-client-toolkit 0.19.2",
  "smol_str",
  "tracing",
  "unicode-segmentation",
@@ -3487,10 +3790,10 @@ dependencies = [
  "wasm-bindgen-futures",
  "wayland-backend",
  "wayland-client",
- "wayland-protocols",
- "wayland-protocols-plasma",
+ "wayland-protocols 0.32.5",
+ "wayland-protocols-plasma 0.3.5",
  "web-sys",
- "web-time",
+ "web-time 1.1.0",
  "windows-sys 0.52.0",
  "x11-dl",
  "x11rb",
@@ -3529,7 +3832,7 @@ dependencies = [
  "kurbo",
  "log",
  "notify-debouncer-full",
- "pollster",
+ "pollster 0.4.0",
  "profiling",
  "scenes",
  "tracing",
@@ -3540,9 +3843,9 @@ dependencies = [
  "wasm-bindgen",
  "wasm-bindgen-futures",
  "web-sys",
- "web-time",
+ "web-time 1.1.0",
  "wgpu-profiler",
- "winit",
+ "winit 0.30.8",
 ]
 
 [[package]]
diff --git a/vello_api/Cargo.toml b/vello_api/Cargo.toml
index ca133d96..e79892ba 100644
--- a/vello_api/Cargo.toml
+++ b/vello_api/Cargo.toml
@@ -20,4 +20,4 @@ targets = []
 workspace = true
 
 [dependencies]
-peniko = { git = "https://github.com/linebender/peniko", rev = "aeded39" }
+peniko = "0.3.1"
diff --git a/vello_hybrid/Cargo.toml b/vello_hybrid/Cargo.toml
index 2656de48..b41b1f9d 100644
--- a/vello_hybrid/Cargo.toml
+++ b/vello_hybrid/Cargo.toml
@@ -23,6 +23,15 @@ workspace = true
 vello_api = { path = "../vello_api" }
 flatten = { git = "https://github.com/linebender/gpu-stroke-expansion-paper", rev = "827ccf6" }
 
+# The following dependencies are for GPU. Maybe they should be optional, or the crate
+# structure refactored so there's a CPU-only renderer that doesn't depend on these.
+
+wgpu = { version = "24.0.1" }
+winit = "0.29"
+pollster = "0.3"
+async-executor = "1.8"
+bytemuck = { version = "1.14", features = ["derive"] }
+
 [dev-dependencies]
 png = "0.17.14"
 roxmltree = "0.20.0"
diff --git a/vello_hybrid/examples/gpu.rs b/vello_hybrid/examples/gpu.rs
new file mode 100644
index 00000000..d5d9fe51
--- /dev/null
+++ b/vello_hybrid/examples/gpu.rs
@@ -0,0 +1,176 @@
+// Copyright 2025 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! Example program for CPU/GPU hybrid rendering.
+
+use std::sync::Arc;
+
+use bytemuck::{Pod, Zeroable};
+use vello_api::peniko::{color::palette, kurbo::{BezPath, Stroke}};
+use vello_hybrid::{GpuRenderCtx, GpuSession};
+use wgpu::util::DeviceExt;
+
+use winit::{
+    event::{Event, WindowEvent},
+    event_loop::EventLoop,
+    window::Window,
+};
+
+#[repr(C)]
+#[derive(Copy, Clone, Pod, Zeroable)]
+struct Config {
+    width: u32,
+    height: u32,
+    strip_height: u32,
+}
+
+async fn run(event_loop: EventLoop<()>, window: Window) {
+    let window = Arc::new(window);
+    let window_clone = window.clone();
+    let instance = wgpu::Instance::new(&Default::default());
+    let surface = instance.create_surface(&window).unwrap();
+    let adapter = instance
+        .request_adapter(&wgpu::RequestAdapterOptions {
+            power_preference: Default::default(),
+            force_fallback_adapter: false,
+            compatible_surface: Some(&surface),
+        })
+        .await
+        .expect("error finding adapter");
+
+    let (device, queue) = adapter
+        .request_device(&Default::default(), None)
+        .await
+        .expect("error creating device");
+    let size = window.inner_size();
+    let swapchain_capabilities = surface.get_capabilities(&adapter);
+    let format = swapchain_capabilities.formats[0];
+    let sc = wgpu::SurfaceConfiguration {
+        usage: wgpu::TextureUsages::RENDER_ATTACHMENT,
+        format,
+        width: size.width,
+        height: size.height,
+        present_mode: wgpu::PresentMode::Fifo,
+        alpha_mode: swapchain_capabilities.alpha_modes[0],
+        view_formats: vec![],
+        desired_maximum_frame_latency: 2,
+    };
+    surface.configure(&device, &sc);
+
+    let session = GpuSession::new(&device, format);
+    // TODO: actually render something
+    let mut render_ctx = GpuRenderCtx::new(size.width as usize, size.height as usize);
+    draw_simple_scene(&mut render_ctx);
+    let bufs = render_ctx.harvest();
+
+    let config = Config {
+        width: size.width,
+        height: size.height,
+        strip_height: 4,
+    };
+
+    let config_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("config"),
+        contents: bytemuck::bytes_of(&config),
+        usage: wgpu::BufferUsages::UNIFORM,
+    });
+
+    let strip_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("strip"),
+        contents: bytemuck::cast_slice(&bufs.strips),
+        usage: wgpu::BufferUsages::STORAGE,
+    });
+    let alpha_buf = device.create_buffer_init(&wgpu::util::BufferInitDescriptor {
+        label: Some("alpha"),
+        contents: bytemuck::cast_slice(&bufs.alphas),
+        usage: wgpu::BufferUsages::STORAGE,
+    });
+
+    let render_bind_group = device.create_bind_group(&wgpu::BindGroupDescriptor {
+        label: None,
+        layout: &session.render_bind_group_layout,
+        entries: &[
+            wgpu::BindGroupEntry {
+                binding: 0,
+                resource: alpha_buf.as_entire_binding(),
+            },
+            wgpu::BindGroupEntry {
+                binding: 1,
+                resource: config_buf.as_entire_binding(),
+            },
+            wgpu::BindGroupEntry {
+                binding: 2,
+                resource: strip_buf.as_entire_binding(),
+            },
+        ],
+    });
+
+    event_loop
+        .run(move |event, target| {
+            if let Event::WindowEvent {
+                window_id: _,
+                event,
+            } = event
+            {
+                match event {
+                    WindowEvent::RedrawRequested => {
+                        let frame = surface
+                            .get_current_texture()
+                            .expect("error getting texture from swap chain");
+
+                        let mut encoder = device.create_command_encoder(&Default::default());
+                        {
+                            let view = frame
+                                .texture
+                                .create_view(&wgpu::TextureViewDescriptor::default());
+                            let mut rpass =
+                                encoder.begin_render_pass(&wgpu::RenderPassDescriptor {
+                                    label: None,
+                                    color_attachments: &[Some(wgpu::RenderPassColorAttachment {
+                                        view: &view,
+                                        resolve_target: None,
+                                        ops: wgpu::Operations {
+                                            load: wgpu::LoadOp::Clear(wgpu::Color::BLACK),
+                                            store: wgpu::StoreOp::Store,
+                                        },
+                                    })],
+                                    depth_stencil_attachment: None,
+                                    timestamp_writes: None,
+                                    occlusion_query_set: None,
+                                });
+                            rpass.set_pipeline(&session.render_pipeline);
+                            rpass.set_bind_group(0, &render_bind_group, &[]);
+                            rpass.draw(0..4, 0..bufs.strips.len() as u32);
+                        }
+                        queue.submit(Some(encoder.finish()));
+                        frame.present();
+                        window_clone.request_redraw();
+                    }
+                    WindowEvent::CloseRequested => {
+                        target.exit();
+                    }
+                    _ => (),
+                }
+            }
+        })
+        .unwrap();
+}
+
+fn draw_simple_scene(ctx: &mut GpuRenderCtx) {
+    let mut path = BezPath::new();
+    path.move_to((10.0, 10.0));
+    path.line_to((180.0, 20.0));
+    path.line_to((30.0, 40.0));
+    path.close_path();
+    let piet_path = path.into();
+    ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into());
+    let stroke = Stroke::new(5.0);
+    ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into());
+}
+
+fn main() {
+    let event_loop = EventLoop::new().unwrap();
+    let window = Window::new(&event_loop).unwrap();
+    window.set_resizable(false);
+    pollster::block_on(run(event_loop, window));
+}
diff --git a/vello_hybrid/shader/render.wgsl b/vello_hybrid/shader/render.wgsl
new file mode 100644
index 00000000..4b1ee1f6
--- /dev/null
+++ b/vello_hybrid/shader/render.wgsl
@@ -0,0 +1,74 @@
+// Copyright 2024 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+// A simple render pipeline for solid color sparse strip rendering.
+
+// Each instance draws one strip consisting of alpha values (dense_width)
+// then a solid region.
+
+struct VertexOutput {
+    @location(0) tex_coord: vec2<f32>,
+    @location(1) @interpolate(flat) dense_end: u32,
+    @location(2) @interpolate(flat) color: u32,
+    @builtin(position) position: vec4<f32>,
+};
+
+struct Config {
+    width: u32,
+    height: u32,
+    strip_height: u32,
+}
+
+struct Strip {
+    xy: u32, // this could be u16's on the Rust side
+    // [width, dense_width] packed as u16's
+    widths: u32,
+    col: u32,
+    rgba: u32,
+}
+
+@group(0) @binding(1)
+var<uniform> config: Config;
+
+@group(0) @binding(2)
+var<storage> strips: array<Strip>;
+
+@vertex
+fn vs_main(
+    @builtin(vertex_index) in_vertex_index: u32,
+    @builtin(instance_index) in_instance_index: u32
+) -> VertexOutput {
+    var out: VertexOutput;
+    let x = f32(in_vertex_index & 1u);
+    let y = f32(in_vertex_index >> 1u);
+    let strip = strips[in_instance_index];
+    let next_strip = strips[in_instance_index + 1u];
+    let x0 = strip.xy & 0xffffu;
+    let y0 = strip.xy >> 16u;
+    let width = strip.widths & 0xffffu;
+    let dense_width = strip.widths >> 16u;
+    out.dense_end = strip.col + dense_width;
+    let pix_x = f32(x0) + f32(width) * x;
+    let pix_y = f32(y0) + y * f32(config.strip_height);
+    let gl_x = (pix_x + 0.5) * 2.0 / f32(config.width) - 1.0;
+    let gl_y = 1.0 - (pix_y + 0.5) * 2.0 / f32(config.height);
+    out.position = vec4<f32>(gl_x, gl_y, 0.0, 1.0);
+    out.tex_coord = vec2<f32>(f32(strip.col) + x * f32(width), y * f32(config.strip_height));
+    out.color = strip.rgba;
+    return out;
+}
+
+@group(0) @binding(0)
+var<storage> alphas: array<u32>;
+
+@fragment
+fn fs_main(in: VertexOutput) -> @location(0) vec4<f32> {
+    let x = u32(floor(in.tex_coord.x));
+    var alpha = 1.0;
+    if x < in.dense_end {
+        let y = u32(floor(in.tex_coord.y));
+        let a = alphas[x];
+        alpha = f32((a >> (y * 8u)) & 0xffu) * (1.0 / 255.0);
+    }
+    return alpha * unpack4x8unorm(in.color);
+}
diff --git a/vello_hybrid/src/lib.rs b/vello_hybrid/src/lib.rs
index 341c6e90..fdfedf89 100644
--- a/vello_hybrid/src/lib.rs
+++ b/vello_hybrid/src/lib.rs
@@ -8,6 +8,7 @@
 
 mod fine;
 mod flatten;
+mod gpu;
 mod pixmap;
 mod render;
 mod simd;
@@ -15,6 +16,10 @@ mod strip;
 mod tiling;
 mod wide_tile;
 
+pub use gpu::{GpuRenderBufs, GpuRenderCtx, GpuSession};
 pub use pixmap::Pixmap;
 pub use render::{CsRenderCtx, CsResourceCtx};
 pub use tiling::FlatLine;
+
+// TODO: this export should be removed, buffer upload will be internal
+pub use gpu::Strip;
diff --git a/vello_hybrid/src/render.rs b/vello_hybrid/src/render.rs
index 8216ef5a..1c7bfa67 100644
--- a/vello_hybrid/src/render.rs
+++ b/vello_hybrid/src/render.rs
@@ -24,10 +24,10 @@ use crate::{
 };
 
 pub struct CsRenderCtx {
-    width: usize,
-    height: usize,
-    tiles: Vec<WideTile>,
-    alphas: Vec<u32>,
+    pub(crate) width: usize,
+    pub(crate) height: usize,
+    pub(crate) tiles: Vec<WideTile>,
+    pub(crate) alphas: Vec<u32>,
 
     /// These are all scratch buffers, to be used for path rendering. They're here solely
     /// so the allocations can be reused.
diff --git a/vello_hybrid/src/wide_tile.rs b/vello_hybrid/src/wide_tile.rs
index fe5296a9..5e71153e 100644
--- a/vello_hybrid/src/wide_tile.rs
+++ b/vello_hybrid/src/wide_tile.rs
@@ -21,6 +21,7 @@ pub(crate) enum Cmd {
 pub(crate) struct CmdFill {
     pub(crate) x: u32,
     pub(crate) width: u32,
+    // TODO: Probably want this pre-packed to u32 to avoid packing cost
     pub(crate) color: AlphaColor<Srgb>,
 }
 

From cdeecc758abeca4f949ab97961915079f95b691f Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Wed, 19 Feb 2025 12:24:04 -0800
Subject: [PATCH 4/5] Add missing file, fix lints

---
 vello_hybrid/examples/gpu.rs |  12 +-
 vello_hybrid/src/gpu.rs      | 212 +++++++++++++++++++++++++++++++++++
 2 files changed, 220 insertions(+), 4 deletions(-)
 create mode 100644 vello_hybrid/src/gpu.rs

diff --git a/vello_hybrid/examples/gpu.rs b/vello_hybrid/examples/gpu.rs
index d5d9fe51..eeb6a1ba 100644
--- a/vello_hybrid/examples/gpu.rs
+++ b/vello_hybrid/examples/gpu.rs
@@ -6,7 +6,10 @@
 use std::sync::Arc;
 
 use bytemuck::{Pod, Zeroable};
-use vello_api::peniko::{color::palette, kurbo::{BezPath, Stroke}};
+use vello_api::peniko::{
+    color::palette,
+    kurbo::{BezPath, Stroke},
+};
 use vello_hybrid::{GpuRenderCtx, GpuSession};
 use wgpu::util::DeviceExt;
 
@@ -109,10 +112,10 @@ async fn run(event_loop: EventLoop<()>, window: Window) {
         .run(move |event, target| {
             if let Event::WindowEvent {
                 window_id: _,
-                event,
+                event: window_event,
             } = event
             {
-                match event {
+                match window_event {
                     WindowEvent::RedrawRequested => {
                         let frame = surface
                             .get_current_texture()
@@ -140,7 +143,8 @@ async fn run(event_loop: EventLoop<()>, window: Window) {
                                 });
                             rpass.set_pipeline(&session.render_pipeline);
                             rpass.set_bind_group(0, &render_bind_group, &[]);
-                            rpass.draw(0..4, 0..bufs.strips.len() as u32);
+                            let n_strips = bufs.strips.len().try_into().expect("too many strips");
+                            rpass.draw(0..4, 0..n_strips);
                         }
                         queue.submit(Some(encoder.finish()));
                         frame.present();
diff --git a/vello_hybrid/src/gpu.rs b/vello_hybrid/src/gpu.rs
new file mode 100644
index 00000000..b4ed688a
--- /dev/null
+++ b/vello_hybrid/src/gpu.rs
@@ -0,0 +1,212 @@
+// Copyright 2025 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+//! The GPU parts of a hybrid CPU/GPU rendering engine.
+
+use bytemuck::{Pod, Zeroable};
+use vello_api::{peniko::BrushRef, RenderCtx};
+use wgpu::{
+    BindGroupLayout, BlendState, ColorTargetState, ColorWrites, Device, PipelineCompilationOptions,
+    RenderPipeline, TextureFormat,
+};
+
+use crate::{
+    wide_tile::{Cmd, STRIP_HEIGHT, WIDE_TILE_WIDTH},
+    CsRenderCtx,
+};
+
+/// Resources common to GPU renders.
+pub struct GpuSession {
+    pub render_bind_group_layout: BindGroupLayout,
+    pub render_pipeline: RenderPipeline,
+}
+
+#[repr(C)]
+#[derive(Debug, Clone, Copy, Zeroable, Pod)]
+pub struct Strip {
+    x: u16,
+    y: u16,
+    width: u16,
+    dense_width: u16,
+    col: u32,
+    rgba: u32,
+}
+
+/// A render context for a single frame.
+///
+/// This will eventually get a `RenderCtx` trait impl.
+pub struct GpuRenderCtx {
+    // At the moment, we take the entire cpu-sparse render context,
+    // but we might split that up.
+    inner: CsRenderCtx,
+}
+
+/// The buffers from a render.
+///
+/// This being a struct is based on a model where all the buffers are uploaded
+/// up front. That will be replaced by the "submit early and often" model.
+pub struct GpuRenderBufs {
+    pub strips: Vec<Strip>,
+    pub alphas: Vec<u32>,
+}
+
+impl GpuSession {
+    pub fn new(device: &Device, format: TextureFormat) -> Self {
+        let render_shader = device.create_shader_module(wgpu::ShaderModuleDescriptor {
+            label: None,
+            source: wgpu::ShaderSource::Wgsl(include_str!("../shader/render.wgsl").into()),
+        });
+        let render_bind_group_layout =
+            device.create_bind_group_layout(&wgpu::BindGroupLayoutDescriptor {
+                label: None,
+                entries: &[
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 0,
+                        visibility: wgpu::ShaderStages::FRAGMENT,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: true },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 1,
+                        visibility: wgpu::ShaderStages::VERTEX,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Uniform,
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                    wgpu::BindGroupLayoutEntry {
+                        binding: 2,
+                        visibility: wgpu::ShaderStages::VERTEX,
+                        ty: wgpu::BindingType::Buffer {
+                            ty: wgpu::BufferBindingType::Storage { read_only: true },
+                            has_dynamic_offset: false,
+                            min_binding_size: None,
+                        },
+                        count: None,
+                    },
+                ],
+            });
+        let pipeline_layout = device.create_pipeline_layout(&wgpu::PipelineLayoutDescriptor {
+            label: None,
+            bind_group_layouts: &[&render_bind_group_layout],
+            push_constant_ranges: &[],
+        });
+        let render_pipeline = device.create_render_pipeline(&wgpu::RenderPipelineDescriptor {
+            label: None,
+            layout: Some(&pipeline_layout),
+            vertex: wgpu::VertexState {
+                module: &render_shader,
+                entry_point: Some("vs_main"),
+                buffers: &[],
+                compilation_options: PipelineCompilationOptions::default(),
+            },
+            fragment: Some(wgpu::FragmentState {
+                module: &render_shader,
+                entry_point: Some("fs_main"),
+                targets: &[Some(ColorTargetState {
+                    format,
+                    blend: Some(BlendState::PREMULTIPLIED_ALPHA_BLENDING),
+                    write_mask: ColorWrites::ALL,
+                })],
+                compilation_options: PipelineCompilationOptions::default(),
+            }),
+            primitive: wgpu::PrimitiveState {
+                topology: wgpu::PrimitiveTopology::TriangleStrip,
+                ..Default::default()
+            },
+            depth_stencil: None,
+            multisample: wgpu::MultisampleState::default(),
+            multiview: None,
+            cache: None,
+        });
+        Self {
+            render_bind_group_layout,
+            render_pipeline,
+        }
+    }
+}
+
+impl GpuRenderCtx {
+    pub fn new(width: usize, height: usize) -> Self {
+        Self {
+            inner: CsRenderCtx::new(width, height),
+        }
+    }
+
+    pub fn harvest(&self) -> GpuRenderBufs {
+        let mut strips = Vec::new();
+        let width_tiles = (self.inner.width).div_ceil(WIDE_TILE_WIDTH);
+        let height_tiles = (self.inner.height).div_ceil(STRIP_HEIGHT);
+        for y in 0..height_tiles {
+            for x in 0..width_tiles {
+                let tile = &self.inner.tiles[y * width_tiles + x];
+                let tile_x = x * WIDE_TILE_WIDTH;
+                let tile_y = y * STRIP_HEIGHT;
+                let bg = tile.bg.to_rgba8().to_u32();
+                if bg != 0 {
+                    let strip = Strip {
+                        x: tile_x as u16,
+                        y: tile_y as u16,
+                        width: WIDE_TILE_WIDTH as u16,
+                        dense_width: 0,
+                        col: 0,
+                        rgba: bg,
+                    };
+                    strips.push(strip);
+                }
+                for cmd in &tile.cmds {
+                    match cmd {
+                        Cmd::Fill(fill) => {
+                            let strip = Strip {
+                                x: (tile_x as u32 + fill.x) as u16,
+                                y: tile_y as u16,
+                                width: fill.width as u16,
+                                dense_width: 0,
+                                col: 0,
+                                rgba: fill.color.to_rgba8().to_u32(),
+                            };
+                            strips.push(strip);
+                        }
+                        Cmd::Strip(cmd_strip) => {
+                            let strip = Strip {
+                                x: (tile_x as u32 + cmd_strip.x) as u16,
+                                y: tile_y as u16,
+                                width: cmd_strip.width as u16,
+                                dense_width: cmd_strip.width as u16,
+                                col: cmd_strip.alpha_ix as u32,
+                                rgba: cmd_strip.color.to_rgba8().to_u32(),
+                            };
+                            strips.push(strip);
+                        }
+                    }
+                }
+            }
+        }
+        GpuRenderBufs {
+            strips,
+            alphas: self.inner.alphas.clone(),
+        }
+    }
+}
+
+// This block will eventually turn into an impl of RenderCtx.
+impl GpuRenderCtx {
+    pub fn fill(&mut self, path: &vello_api::Path, brush: BrushRef<'_>) {
+        self.inner.fill(path, brush);
+    }
+
+    pub fn stroke(
+        &mut self,
+        path: &vello_api::Path,
+        stroke: &vello_api::peniko::kurbo::Stroke,
+        brush: BrushRef<'_>,
+    ) {
+        self.inner.stroke(path, stroke, brush);
+    }
+}

From 184dd5ba2bc5d535c78e131c536f0faa0157b626 Mon Sep 17 00:00:00 2001
From: Raph Levien <raph.levien@gmail.com>
Date: Thu, 20 Feb 2025 17:01:09 -0800
Subject: [PATCH 5/5] Implement basic clip logic

Adds a clip method to the (CPU) render context, plus a considerable amount of mechanism in coarse and fine rasterization to support clipping.

The coarse rasterization logic contains a similar set of optimizations as Vello. In particular, all-zero tiles have drawing suppressed, and all-one tiles pass drawing commands through with no additional work to clip.

Not extensively validated, but it does render a simple scene with clipping correctly.
---
 vello_hybrid/examples/gpu.rs    |   2 +-
 vello_hybrid/examples/simple.rs |  41 +++--
 vello_hybrid/src/fine.rs        |  58 +++++-
 vello_hybrid/src/gpu.rs         |   4 +-
 vello_hybrid/src/render.rs      | 304 +++++++++++++++++++++++++++++---
 vello_hybrid/src/simd/neon.rs   |  24 +--
 vello_hybrid/src/wide_tile.rs   | 104 ++++++++++-
 7 files changed, 476 insertions(+), 61 deletions(-)

diff --git a/vello_hybrid/examples/gpu.rs b/vello_hybrid/examples/gpu.rs
index eeb6a1ba..433823f2 100644
--- a/vello_hybrid/examples/gpu.rs
+++ b/vello_hybrid/examples/gpu.rs
@@ -61,7 +61,7 @@ async fn run(event_loop: EventLoop<()>, window: Window) {
     surface.configure(&device, &sc);
 
     let session = GpuSession::new(&device, format);
-    // TODO: actually render something
+
     let mut render_ctx = GpuRenderCtx::new(size.width as usize, size.height as usize);
     draw_simple_scene(&mut render_ctx);
     let bufs = render_ctx.harvest();
diff --git a/vello_hybrid/examples/simple.rs b/vello_hybrid/examples/simple.rs
index e91f0175..bae2c10e 100644
--- a/vello_hybrid/examples/simple.rs
+++ b/vello_hybrid/examples/simple.rs
@@ -7,24 +7,16 @@
 use std::io::BufWriter;
 
 use vello_api::peniko::color::palette;
-use vello_api::peniko::kurbo::{BezPath, Stroke};
+use vello_api::peniko::kurbo::{BezPath, Point, Stroke, Vec2};
 use vello_api::RenderCtx;
 use vello_hybrid::{CsRenderCtx, Pixmap};
 
 const WIDTH: usize = 1024;
-const HEIGHT: usize = 256;
+const HEIGHT: usize = 1024;
 
 pub fn main() {
     let mut ctx = CsRenderCtx::new(WIDTH, HEIGHT);
-    let mut path = BezPath::new();
-    path.move_to((10.0, 10.0));
-    path.line_to((180.0, 20.0));
-    path.line_to((30.0, 40.0));
-    path.close_path();
-    let piet_path = path.into();
-    ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into());
-    let stroke = Stroke::new(5.0);
-    ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into());
+    draw_simple_scene(&mut ctx);
     if let Some(filename) = std::env::args().nth(1) {
         let mut pixmap = Pixmap::new(WIDTH, HEIGHT);
         ctx.render_to_pixmap(&mut pixmap);
@@ -39,3 +31,30 @@ pub fn main() {
         ctx.debug_dump();
     }
 }
+
+fn star(center: Point, n: usize, inner: f64, outer: f64) -> BezPath {
+    let mut path = BezPath::new();
+    path.move_to(center + Vec2::new(outer, 0.));
+    for i in 1..n * 2 {
+        let th = i as f64 * std::f64::consts::PI / n as f64;
+        let r = if i % 2 == 0 { outer } else { inner };
+        path.line_to(center + r * Vec2::from_angle(th));
+    }
+    path.close_path();
+    path
+}
+
+fn draw_simple_scene(ctx: &mut CsRenderCtx) {
+    let mut path = BezPath::new();
+    path.move_to((10.0, 10.0));
+    path.line_to((180.0, 20.0));
+    path.line_to((30.0, 180.0));
+    path.close_path();
+    // Note: we plan to change the API to have `into`.
+    let piet_path = path.into();
+    let stroke = Stroke::new(5.0);
+    ctx.stroke(&piet_path, &stroke, palette::css::DARK_BLUE.into());
+    let star_path = star(Point::new(100., 100.), 13, 50., 95.);
+    ctx.clip(&star_path.into());
+    ctx.fill(&piet_path, palette::css::REBECCA_PURPLE.into());
+}
diff --git a/vello_hybrid/src/fine.rs b/vello_hybrid/src/fine.rs
index 4b935528..52b94b05 100644
--- a/vello_hybrid/src/fine.rs
+++ b/vello_hybrid/src/fine.rs
@@ -15,7 +15,7 @@ pub(crate) struct Fine<'a> {
     // f32 RGBA pixels
     // That said, if we use u8, then this is basically a block of
     // untyped memory.
-    pub(crate) scratch: [f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4],
+    pub(crate) scratch: Vec<[f32; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]>,
     #[allow(clippy::doc_markdown, reason = "false positive for x86_64")]
     /// Whether to use SIMD
     ///
@@ -31,7 +31,7 @@ pub(crate) struct Fine<'a> {
 
 impl<'a> Fine<'a> {
     pub(crate) fn new(width: usize, height: usize, out_buf: &'a mut [u8]) -> Self {
-        let scratch = [0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4];
+        let scratch = vec![[0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]];
         Self {
             width,
             height,
@@ -42,7 +42,8 @@ impl<'a> Fine<'a> {
     }
 
     pub(crate) fn clear_scalar(&mut self, color: [f32; 4]) {
-        for z in self.scratch.chunks_exact_mut(4) {
+        let scratch = self.scratch.last_mut().unwrap();
+        for z in scratch.chunks_exact_mut(4) {
             z.copy_from_slice(&color);
         }
     }
@@ -58,12 +59,13 @@ impl<'a> Fine<'a> {
             (y + 1) * STRIP_HEIGHT <= self.height,
             "overflow of pixmap height"
         );
+        let scratch = self.scratch.last_mut().unwrap();
         let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4;
         for j in 0..STRIP_HEIGHT {
             let line_ix = base_ix + j * self.width * 4;
             for i in 0..WIDE_TILE_WIDTH {
                 let mut rgba_f32 = [0.0; 4];
-                rgba_f32.copy_from_slice(&self.scratch[(i * STRIP_HEIGHT + j) * 4..][..4]);
+                rgba_f32.copy_from_slice(&scratch[(i * STRIP_HEIGHT + j) * 4..][..4]);
                 let rgba_u8 = rgba_f32.map(|z| (z * 255.0).round() as u8);
                 self.out_buf[line_ix + i * 4..][..4].copy_from_slice(&rgba_u8);
             }
@@ -79,20 +81,28 @@ impl<'a> Fine<'a> {
                 let aslice = &alphas[s.alpha_ix..];
                 self.strip(s.x as usize, s.width as usize, aslice, s.color.components);
             }
+            Cmd::PushClip => self.scratch.push([0.0; WIDE_TILE_WIDTH * STRIP_HEIGHT * 4]),
+            Cmd::PopClip => _ = self.scratch.pop(),
+            Cmd::ClipFill(f) => {
+                self.clip_fill_scalar(f.x as usize, f.width as usize);
+            }
+            Cmd::ClipStrip(s) => {
+                let aslice = &alphas[s.alpha_ix..];
+                self.clip_strip_scalar(s.x as usize, s.width as usize, aslice);
+            }
         }
     }
 
     pub(crate) fn fill_scalar(&mut self, x: usize, width: usize, color: [f32; 4]) {
+        let scratch = self.scratch.last_mut().unwrap();
         if color[3] == 1.0 {
-            for z in
-                self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
+            for z in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
             {
                 z.copy_from_slice(&color);
             }
         } else {
             let one_minus_alpha = 1.0 - color[3];
-            for z in
-                self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
+            for z in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width].chunks_exact_mut(4)
             {
                 for i in 0..4 {
                     //z[i] = color[i] + one_minus_alpha * z[i];
@@ -106,9 +116,10 @@ impl<'a> Fine<'a> {
     }
 
     pub(crate) fn strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32], color: [f32; 4]) {
+        let scratch = self.scratch.last_mut().unwrap();
         debug_assert!(alphas.len() >= width, "overflow of alphas buffer");
         let cs = color.map(|z| z * (1.0 / 255.0));
-        for (z, a) in self.scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width]
+        for (z, a) in scratch[x * STRIP_HEIGHT_F32..][..STRIP_HEIGHT_F32 * width]
             .chunks_exact_mut(16)
             .zip(alphas)
         {
@@ -121,4 +132,33 @@ impl<'a> Fine<'a> {
             }
         }
     }
+
+    fn clip_fill_scalar(&mut self, x: usize, width: usize) {
+        let (tos, rest) = self.scratch.split_last_mut().unwrap();
+        let nos = rest.last_mut().unwrap();
+        for i in 0..width {
+            for j in 0..4 {
+                let ix = (x + i) * STRIP_HEIGHT_F32 + j * 4;
+                let one_minus_alpha = 1.0 - tos[ix + 3];
+                for k in 0..4 {
+                    nos[ix + k] = nos[ix + k].mul_add(one_minus_alpha, tos[ix + k]);
+                }
+            }
+        }
+    }
+
+    fn clip_strip_scalar(&mut self, x: usize, width: usize, alphas: &[u32]) {
+        let (tos, rest) = self.scratch.split_last_mut().unwrap();
+        let nos = rest.last_mut().unwrap();
+        for (i, a) in alphas.iter().take(width).enumerate() {
+            for j in 0..4 {
+                let ix = (x + i) * STRIP_HEIGHT_F32 + j * 4;
+                let mask_alpha = ((a >> (j * 8)) & 0xff) as f32 * (1. / 255.);
+                let one_minus_alpha = 1.0 - mask_alpha * tos[ix + 3];
+                for k in 0..4 {
+                    nos[ix + k] = nos[ix + k].mul_add(one_minus_alpha, mask_alpha * tos[ix + k]);
+                }
+            }
+        }
+    }
 }
diff --git a/vello_hybrid/src/gpu.rs b/vello_hybrid/src/gpu.rs
index b4ed688a..c5a1b652 100644
--- a/vello_hybrid/src/gpu.rs
+++ b/vello_hybrid/src/gpu.rs
@@ -139,7 +139,8 @@ impl GpuRenderCtx {
         }
     }
 
-    pub fn harvest(&self) -> GpuRenderBufs {
+    pub fn harvest(&mut self) -> GpuRenderBufs {
+        self.inner.finish();
         let mut strips = Vec::new();
         let width_tiles = (self.inner.width).div_ceil(WIDE_TILE_WIDTH);
         let height_tiles = (self.inner.height).div_ceil(STRIP_HEIGHT);
@@ -184,6 +185,7 @@ impl GpuRenderCtx {
                             };
                             strips.push(strip);
                         }
+                        _ => todo!(),
                     }
                 }
             }
diff --git a/vello_hybrid/src/render.rs b/vello_hybrid/src/render.rs
index 1c7bfa67..7ecb8b27 100644
--- a/vello_hybrid/src/render.rs
+++ b/vello_hybrid/src/render.rs
@@ -19,7 +19,7 @@ use crate::{
     fine::Fine,
     strip::{self, Strip, Tile},
     tiling::{self, FlatLine},
-    wide_tile::{Cmd, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH},
+    wide_tile::{Cmd, CmdClipStrip, CmdStrip, WideTile, STRIP_HEIGHT, WIDE_TILE_WIDTH},
     Pixmap,
 };
 
@@ -34,6 +34,22 @@ pub struct CsRenderCtx {
     line_buf: Vec<FlatLine>,
     tile_buf: Vec<Tile>,
     strip_buf: Vec<Strip>,
+
+    state_stack: Vec<GfxState>,
+    clip_stack: Vec<Clip>,
+}
+
+struct GfxState {
+    // TODO: transform goes here (there's logic in piet-ts to copy)
+    n_clip: usize,
+}
+
+struct Clip {
+    // should probably be a bounding box type
+    /// The intersected bounding box after clip
+    clip_bbox: [usize; 4],
+    /// The rendered path in sparse strip representation
+    strips: Vec<Strip>,
 }
 
 pub struct CsResourceCtx;
@@ -45,18 +61,17 @@ impl CsRenderCtx {
         let tiles = (0..width_tiles * height_tiles)
             .map(|_| WideTile::default())
             .collect();
-        let alphas = vec![];
-        let line_buf = vec![];
-        let tile_buf = vec![];
-        let strip_buf = vec![];
+        let state = GfxState { n_clip: 0 };
         Self {
             width,
             height,
             tiles,
-            alphas,
-            line_buf,
-            tile_buf,
-            strip_buf,
+            alphas: vec![],
+            line_buf: vec![],
+            tile_buf: vec![],
+            strip_buf: vec![],
+            state_stack: vec![state],
+            clip_stack: vec![],
         }
     }
 
@@ -67,7 +82,16 @@ impl CsRenderCtx {
         }
     }
 
-    pub fn render_to_pixmap(&self, pixmap: &mut Pixmap) {
+    /// Finish the coarse rasterization prior to fine rendering.
+    ///
+    /// At the moment, this mostly involves resolving any open clips, but
+    /// might extend to other things.
+    pub(crate) fn finish(&mut self) {
+        self.pop_clips();
+    }
+
+    pub fn render_to_pixmap(&mut self, pixmap: &mut Pixmap) {
+        self.finish();
         let mut fine = Fine::new(pixmap.width, pixmap.height, &mut pixmap.buf);
         let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH);
         let height_tiles = (self.height).div_ceil(STRIP_HEIGHT);
@@ -94,27 +118,43 @@ impl CsRenderCtx {
         println!("total = {total}, {histo:?}");
     }
 
+    /// Render a path to the strip buffer.
+    fn render_path_common(&mut self) {
+        tiling::make_tiles(&self.line_buf, &mut self.tile_buf);
+        self.tile_buf.sort_unstable_by(Tile::cmp);
+        crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas);
+    }
+
     /// Render a path, which has already been flattened into `line_buf`.
     fn render_path(&mut self, brush: BrushRef<'_>) {
         // TODO: need to make sure tiles contained in viewport - we'll likely
         // panic otherwise.
-        tiling::make_tiles(&self.line_buf, &mut self.tile_buf);
-        self.tile_buf.sort_unstable_by(Tile::cmp);
-        crate::simd::render_strips(&self.tile_buf, &mut self.strip_buf, &mut self.alphas);
+        self.render_path_common();
         let color = brush_to_color(brush);
         let width_tiles = self.width.div_ceil(WIDE_TILE_WIDTH);
+        let bbox = self.get_bbox();
         for i in 0..self.strip_buf.len() - 1 {
             let strip = &self.strip_buf[i];
             let next_strip = &self.strip_buf[i + 1];
             let x0 = strip.x();
-            let y = strip.strip_y();
-            let row_start = y as usize * width_tiles;
+            let y = strip.strip_y() as usize;
+            if y < bbox[1] {
+                continue;
+            }
+            if y >= bbox[3] {
+                break;
+            }
+            let row_start = y * width_tiles;
             let strip_width = next_strip.col - strip.col;
             let x1 = x0 + strip_width;
-            let xtile0 = x0 as usize / WIDE_TILE_WIDTH;
-            let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH);
+            let xtile0 = (x0 as usize / WIDE_TILE_WIDTH).max(bbox[0]);
+            let xtile1 = (x1 as usize).div_ceil(WIDE_TILE_WIDTH).min(bbox[2]);
             let mut x = x0;
             let mut col = strip.col;
+            if (bbox[0] * WIDE_TILE_WIDTH) as u32 > x {
+                col += (bbox[0] * WIDE_TILE_WIDTH) as u32 - x;
+                x = (bbox[0] * WIDE_TILE_WIDTH) as u32;
+            }
             for xtile in xtile0..xtile1 {
                 let x_tile_rel = x % WIDE_TILE_WIDTH as u32;
                 let width = x1.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x;
@@ -126,13 +166,13 @@ impl CsRenderCtx {
                 };
                 x += width;
                 col += width;
-                self.tiles[row_start + xtile].push(Cmd::Strip(cmd));
+                self.tiles[row_start + xtile].strip(cmd);
             }
-            if next_strip.winding != 0 && y == next_strip.strip_y() {
+            if next_strip.winding != 0 && y == next_strip.strip_y() as usize {
                 x = x1;
                 let x2 = next_strip.x();
-                let fxt0 = x1 as usize / WIDE_TILE_WIDTH;
-                let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH);
+                let fxt0 = (x1 as usize / WIDE_TILE_WIDTH).max(bbox[0]);
+                let fxt1 = (x2 as usize).div_ceil(WIDE_TILE_WIDTH).min(bbox[2]);
                 for xtile in fxt0..fxt1 {
                     let x_tile_rel = x % WIDE_TILE_WIDTH as u32;
                     let width = x2.min(((xtile + 1) * WIDE_TILE_WIDTH) as u32) - x;
@@ -161,6 +201,133 @@ impl CsRenderCtx {
         // TODO: get from graphics state
         Affine::scale(5.0)
     }
+
+    fn get_bbox(&self) -> [usize; 4] {
+        if let Some(tos) = self.clip_stack.last() {
+            tos.clip_bbox
+        } else {
+            let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH);
+            let height_tiles = (self.height).div_ceil(STRIP_HEIGHT);
+            [0, 0, width_tiles, height_tiles]
+        }
+    }
+
+    fn pop_clip(&mut self) {
+        self.state_stack.last_mut().unwrap().n_clip -= 1;
+        let Clip { clip_bbox, strips } = self.clip_stack.pop().unwrap();
+        let n_strips = strips.len();
+        // The next bit of code accomplishes the following. For each tile in
+        // the intersected bounding box, it does one of three things depending
+        // on the contents of the clip path in that tile.
+        // If all-zero: pop a zero_clip.
+        // If all-one: do nothing.
+        // If contains one or more strips: render strips and fills, then pop a clip.
+        // This logic is the inverse of the push logic in `clip()`, and the stack
+        // should be balanced after running both.
+        let mut tile_x = clip_bbox[0];
+        let mut tile_y = clip_bbox[1];
+        let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH);
+        let mut pop_pending = false;
+        for i in 0..n_strips - 1 {
+            let strip = &strips[i];
+            let y = strip.strip_y() as usize;
+            if y < tile_y {
+                continue;
+            }
+            while tile_y < y.min(clip_bbox[3]) {
+                if core::mem::take(&mut pop_pending) {
+                    self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+                    tile_x += 1;
+                }
+                for x in tile_x..clip_bbox[2] {
+                    self.tiles[tile_y * width_tiles + x].pop_zero_clip();
+                }
+                tile_x = clip_bbox[0];
+                tile_y += 1;
+            }
+            if tile_y == clip_bbox[3] {
+                break;
+            }
+            let x0 = strip.x() as usize;
+            let x_clamped = (x0 / WIDE_TILE_WIDTH).min(clip_bbox[2]);
+            if tile_x < x_clamped {
+                if core::mem::take(&mut pop_pending) {
+                    self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+                    tile_x += 1;
+                }
+                // The winding check is probably not needed; if there was a fill,
+                // the logic below should have advanced tile_x.
+                if strip.winding == 0 {
+                    for x in tile_x..x_clamped {
+                        self.tiles[tile_y * width_tiles + x].pop_zero_clip();
+                    }
+                }
+                tile_x = x_clamped;
+            }
+            let next_strip = &strips[i + 1];
+            let strip_width = (next_strip.col - strip.col) as usize;
+            let x1 = x0 + strip_width;
+            let xtile0 = (x0 / WIDE_TILE_WIDTH).max(clip_bbox[0]);
+            let xtile1 = x1.div_ceil(WIDE_TILE_WIDTH).min(clip_bbox[2]);
+            let mut x = x0;
+            let mut alpha_ix = strip.col as usize;
+            if clip_bbox[0] * WIDE_TILE_WIDTH > x {
+                alpha_ix += clip_bbox[0] * WIDE_TILE_WIDTH - x;
+                x = clip_bbox[0] * WIDE_TILE_WIDTH;
+            }
+            for xtile in xtile0..xtile1 {
+                if xtile > tile_x && core::mem::take(&mut pop_pending) {
+                    self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+                }
+                let x_tile_rel = (x % WIDE_TILE_WIDTH) as u32;
+                let width = x1.min((xtile + 1) * WIDE_TILE_WIDTH) - x;
+                let cmd = CmdClipStrip {
+                    x: x_tile_rel,
+                    width: width as u32,
+                    alpha_ix,
+                };
+                x += width;
+                alpha_ix += width;
+                self.tiles[tile_y * width_tiles + xtile].clip_strip(cmd);
+                tile_x = xtile;
+                pop_pending = true;
+            }
+            if next_strip.winding != 0 && y == next_strip.strip_y() as usize {
+                let x2 = next_strip.x() as usize;
+                let tile_x2 = x2.min((tile_x + 1) * WIDE_TILE_WIDTH);
+                let width = tile_x2 - x1;
+                if width > 0 {
+                    let x_tile_rel = (x1 % WIDE_TILE_WIDTH) as u32;
+                    self.tiles[tile_y * width_tiles + tile_x].clip_fill(x_tile_rel, width as u32);
+                }
+                if x2 > (tile_x + 1) * WIDE_TILE_WIDTH {
+                    self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+                    let width2 = x2 % WIDE_TILE_WIDTH;
+                    tile_x = x2 / WIDE_TILE_WIDTH;
+                    if width2 > 0 {
+                        self.tiles[tile_y * width_tiles + tile_x].clip_fill(0, width2 as u32);
+                    }
+                }
+            }
+        }
+        if core::mem::take(&mut pop_pending) {
+            self.tiles[tile_y * width_tiles + tile_x].pop_clip();
+            tile_x += 1;
+        }
+        while tile_y < clip_bbox[3] {
+            for x in tile_x..clip_bbox[2] {
+                self.tiles[tile_y * width_tiles + x].pop_zero_clip();
+            }
+            tile_x = clip_bbox[0];
+            tile_y += 1;
+        }
+    }
+
+    fn pop_clips(&mut self) {
+        while self.state_stack.last().unwrap().n_clip > 0 {
+            self.pop_clip();
+        }
+    }
 }
 
 impl RenderCtx for CsRenderCtx {
@@ -200,15 +367,104 @@ impl RenderCtx for CsRenderCtx {
     }
 
     fn clip(&mut self, path: &vello_api::Path) {
-        todo!()
+        let affine = self.get_affine();
+        crate::flatten::fill(&path.path, affine, &mut self.line_buf);
+        self.render_path_common();
+        let strips = core::mem::take(&mut self.strip_buf);
+        let n_strips = strips.len();
+        let path_bbox = if n_strips <= 1 {
+            [0, 0, 0, 0]
+        } else {
+            let y0 = strips[0].strip_y() as usize;
+            let y1 = strips[n_strips - 1].strip_y() as usize + 1;
+            let mut x0 = strips[0].x() as usize / WIDE_TILE_WIDTH;
+            let mut x1 = x0;
+            for i in 0..n_strips - 1 {
+                let strip = &strips[i];
+                let next_strip = &strips[i + 1];
+                let width = next_strip.col - strip.col;
+                let x = strip.x() as usize;
+                x0 = x0.min(x / WIDE_TILE_WIDTH);
+                x1 = x1.max((x + width as usize).div_ceil(WIDE_TILE_WIDTH));
+            }
+            [x0, x1, y0, y1]
+        };
+        let parent_bbox = self.get_bbox();
+        // intersect clip bounding box
+        let clip_bbox = [
+            parent_bbox[0].max(path_bbox[0]),
+            parent_bbox[1].max(path_bbox[1]),
+            parent_bbox[2].min(path_bbox[2]),
+            parent_bbox[3].min(path_bbox[3]),
+        ];
+        // The next bit of code accomplishes the following. For each tile in
+        // the intersected bounding box, it does one of three things depending
+        // on the contents of the clip path in that tile.
+        // If all-zero: push a zero_clip
+        // If all-one: do nothing
+        // If contains one or more strips: push a clip
+        let mut tile_x = clip_bbox[0];
+        let mut tile_y = clip_bbox[1];
+        let width_tiles = (self.width).div_ceil(WIDE_TILE_WIDTH);
+        for i in 0..n_strips - 1 {
+            let strip = &strips[i];
+            let y = strip.strip_y() as usize;
+            if y < tile_y {
+                continue;
+            }
+            while tile_y < y.min(clip_bbox[3]) {
+                for x in tile_x..clip_bbox[2] {
+                    self.tiles[tile_y * width_tiles + x].push_zero_clip();
+                }
+                tile_x = clip_bbox[0];
+                tile_y += 1;
+            }
+            if tile_y == clip_bbox[3] {
+                break;
+            }
+            let x_pixels = strip.x() as usize;
+            let x_clamped = (x_pixels / WIDE_TILE_WIDTH).min(clip_bbox[2]);
+            if tile_x < x_clamped {
+                if strip.winding == 0 {
+                    for x in tile_x..x_clamped {
+                        self.tiles[tile_y * width_tiles + x].push_zero_clip();
+                    }
+                }
+                // If winding is nonzero, then wide tiles covered entirely
+                // by sparse fill are no-op (no clipping is applied).
+                tile_x = x_clamped;
+            }
+            let next_strip = &strips[i + 1];
+            let width = (next_strip.col - strip.col) as usize;
+            let x1 = (x_pixels + width)
+                .div_ceil(WIDE_TILE_WIDTH)
+                .min(clip_bbox[2]);
+            if tile_x < x1 {
+                for x in tile_x..x1 {
+                    self.tiles[tile_y * width_tiles + x].push_clip();
+                }
+                tile_x = x1;
+            }
+        }
+        while tile_y < clip_bbox[3] {
+            for x in tile_x..clip_bbox[2] {
+                self.tiles[tile_y * width_tiles + x].push_zero_clip();
+            }
+            tile_x = clip_bbox[0];
+            tile_y += 1;
+        }
+        let clip = Clip { clip_bbox, strips };
+        self.clip_stack.push(clip);
+        self.state_stack.last_mut().unwrap().n_clip += 1;
     }
 
     fn save(&mut self) {
-        todo!()
+        self.state_stack.push(GfxState { n_clip: 0 });
     }
 
     fn restore(&mut self) {
-        todo!()
+        self.pop_clips();
+        self.state_stack.pop();
     }
 
     fn transform(&mut self, affine: vello_api::peniko::kurbo::Affine) {
diff --git a/vello_hybrid/src/simd/neon.rs b/vello_hybrid/src/simd/neon.rs
index 72bf9202..777b8094 100644
--- a/vello_hybrid/src/simd/neon.rs
+++ b/vello_hybrid/src/simd/neon.rs
@@ -14,16 +14,18 @@ use crate::{
 
 impl Fine<'_> {
     pub(crate) unsafe fn clear_simd(&mut self, color: [f32; 4]) {
+        let scratch = self.scratch.last_mut().unwrap();
         unsafe {
             let v_color = vld1q_f32(color.as_ptr());
             let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color);
             for i in 0..WIDE_TILE_WIDTH {
-                vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4);
+                vst1q_f32_x4(scratch.as_mut_ptr().add(i * 16), v_color_4);
             }
         }
     }
 
     pub(crate) fn pack_simd(&mut self, x: usize, y: usize) {
+        let scratch = self.scratch.last_mut().unwrap();
         unsafe fn cvt(v: float32x4_t) -> uint8x16_t {
             unsafe {
                 let clamped = vminq_f32(v, vdupq_n_f32(1.0));
@@ -40,14 +42,14 @@ impl Fine<'_> {
             let base_ix = (y * STRIP_HEIGHT * self.width + x * WIDE_TILE_WIDTH) * 4;
             for i in (0..WIDE_TILE_WIDTH).step_by(4) {
                 let chunk_ix = base_ix + i * 4;
-                let v0 = vld1q_f32_x4(self.scratch.as_ptr().add(i * 16));
-                let v1 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 1) * 16));
+                let v0 = vld1q_f32_x4(scratch.as_ptr().add(i * 16));
+                let v1 = vld1q_f32_x4(scratch.as_ptr().add((i + 1) * 16));
                 let x0 = cvt2(v0.0, v1.0);
                 let x1 = cvt2(v0.1, v1.1);
                 let x2 = cvt2(v0.2, v1.2);
                 let x3 = cvt2(v0.3, v1.3);
-                let v2 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 2) * 16));
-                let v3 = vld1q_f32_x4(self.scratch.as_ptr().add((i + 3) * 16));
+                let v2 = vld1q_f32_x4(scratch.as_ptr().add((i + 2) * 16));
+                let v3 = vld1q_f32_x4(scratch.as_ptr().add((i + 3) * 16));
                 let x4 = cvt2(v2.0, v3.0);
                 let y0 = vuzp1q_u8(x0, x4);
                 vst1q_u8(self.out_buf.as_mut_ptr().add(chunk_ix), y0);
@@ -68,24 +70,25 @@ impl Fine<'_> {
     }
 
     pub(crate) unsafe fn fill_simd(&mut self, x: usize, width: usize, color: [f32; 4]) {
+        let scratch = self.scratch.last_mut().unwrap();
         unsafe {
             let v_color = vld1q_f32(color.as_ptr());
             let alpha = color[3];
             if alpha == 1.0 {
                 let v_color_4 = float32x4x4_t(v_color, v_color, v_color, v_color);
                 for i in x..x + width {
-                    vst1q_f32_x4(self.scratch.as_mut_ptr().add(i * 16), v_color_4);
+                    vst1q_f32_x4(scratch.as_mut_ptr().add(i * 16), v_color_4);
                 }
             } else {
                 let one_minus_alpha = vdupq_n_f32(1.0 - alpha);
                 for i in x..x + width {
                     let ix = (x + i) * 16;
-                    let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix));
+                    let mut v = vld1q_f32_x4(scratch.as_ptr().add(ix));
                     v.0 = vfmaq_f32(v_color, v.0, one_minus_alpha);
                     v.1 = vfmaq_f32(v_color, v.1, one_minus_alpha);
                     v.2 = vfmaq_f32(v_color, v.2, one_minus_alpha);
                     v.3 = vfmaq_f32(v_color, v.3, one_minus_alpha);
-                    vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v);
+                    vst1q_f32_x4(scratch.as_mut_ptr().add(ix), v);
                 }
             }
         }
@@ -99,6 +102,7 @@ impl Fine<'_> {
         alphas: &[u32],
         color: [f32; 4],
     ) {
+        let scratch = self.scratch.last_mut().unwrap();
         unsafe {
             debug_assert!(alphas.len() >= width, "overflow of alphas buffer");
             let v_color = vmulq_f32(vld1q_f32(color.as_ptr()), vdupq_n_f32(1.0 / 255.0));
@@ -111,13 +115,13 @@ impl Fine<'_> {
                 let a4 = vreinterpretq_u32_u16(vzip1q_u16(a3, vdupq_n_u16(0)));
                 let alpha = vcvtq_f32_u32(a4);
                 let ix = (x + i) * 16;
-                let mut v = vld1q_f32_x4(self.scratch.as_ptr().add(ix));
+                let mut v = vld1q_f32_x4(scratch.as_ptr().add(ix));
                 let one_minus_alpha = vfmsq_laneq_f32(vdupq_n_f32(1.0), alpha, v_color, 3);
                 v.0 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 0), v.0, one_minus_alpha, 0);
                 v.1 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 1), v.1, one_minus_alpha, 1);
                 v.2 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 2), v.2, one_minus_alpha, 2);
                 v.3 = vfmaq_laneq_f32(vmulq_laneq_f32(v_color, alpha, 3), v.3, one_minus_alpha, 3);
-                vst1q_f32_x4(self.scratch.as_mut_ptr().add(ix), v);
+                vst1q_f32_x4(scratch.as_mut_ptr().add(ix), v);
             }
         }
     }
diff --git a/vello_hybrid/src/wide_tile.rs b/vello_hybrid/src/wide_tile.rs
index 5e71153e..9d8dbcd6 100644
--- a/vello_hybrid/src/wide_tile.rs
+++ b/vello_hybrid/src/wide_tile.rs
@@ -6,15 +6,24 @@ use vello_api::peniko::color::{AlphaColor, Srgb};
 pub(crate) const WIDE_TILE_WIDTH: usize = 256;
 pub(crate) const STRIP_HEIGHT: usize = 4;
 
+#[derive(Debug)]
 pub(crate) struct WideTile {
     pub(crate) bg: AlphaColor<Srgb>,
     pub(crate) cmds: Vec<Cmd>,
+    n_zero_clip: usize,
+    n_clip: usize,
 }
 
 #[derive(Debug)]
 pub(crate) enum Cmd {
     Fill(CmdFill),
     Strip(CmdStrip),
+    /// Pushes a new transparent buffer to the clip stack.
+    PushClip,
+    /// Pops the clip stack.
+    PopClip,
+    ClipFill(CmdClipFill),
+    ClipStrip(CmdClipStrip),
 }
 
 #[derive(Debug)]
@@ -33,26 +42,111 @@ pub(crate) struct CmdStrip {
     pub(crate) color: AlphaColor<Srgb>,
 }
 
+/// Same as fill, but copies top of clip stack to next on stack.
+#[derive(Debug)]
+pub(crate) struct CmdClipFill {
+    pub(crate) x: u32,
+    pub(crate) width: u32,
+    // TODO: this should probably get at least an alpha for group opacity
+    // Also, this is where blend modes go.
+}
+
+/// Same as strip, but composites top of clip stack to next on stack.
+#[derive(Debug)]
+pub(crate) struct CmdClipStrip {
+    pub(crate) x: u32,
+    pub(crate) width: u32,
+    pub(crate) alpha_ix: usize,
+    // See `CmdClipFill` for blending extension points
+}
+
 impl Default for WideTile {
     fn default() -> Self {
         Self {
             bg: AlphaColor::TRANSPARENT,
             cmds: vec![],
+            n_zero_clip: 0,
+            n_clip: 0,
         }
     }
 }
 
 impl WideTile {
     pub(crate) fn fill(&mut self, x: u32, width: u32, color: AlphaColor<Srgb>) {
-        if x == 0 && width == WIDE_TILE_WIDTH as u32 && color.components[3] == 1.0 {
-            self.cmds.clear();
-            self.bg = color;
-        } else {
-            self.cmds.push(Cmd::Fill(CmdFill { x, width, color }));
+        if !self.is_zero_clip() {
+            // Note that we could be more aggressive in optimizing a whole-tile opaque fill
+            // even with a clip stack. It would be valid to elide all drawing commands from
+            // the enclosing clip push up to the fill. Further, we could extend the clip
+            // push command to include a background color, rather than always starting with
+            // a transparent buffer. Lastly, a sequence of push(bg); strip/fill; pop could
+            // be replaced with strip/fill with the color (the latter is true even with a
+            // non-opaque color).
+            //
+            // However, the extra cost of tracking such optimizations may outweigh the
+            // benefit, especially in hybrid mode with GPU painting.
+            if x == 0
+                && width == WIDE_TILE_WIDTH as u32
+                && color.components[3] == 1.0
+                && self.n_clip == 0
+            {
+                self.cmds.clear();
+                self.bg = color;
+            } else {
+                self.cmds.push(Cmd::Fill(CmdFill { x, width, color }));
+            }
+        }
+    }
+
+    pub(crate) fn strip(&mut self, cmd_strip: CmdStrip) {
+        if !self.is_zero_clip() {
+            self.cmds.push(Cmd::Strip(cmd_strip));
         }
     }
 
     pub(crate) fn push(&mut self, cmd: Cmd) {
         self.cmds.push(cmd);
     }
+
+    pub(crate) fn push_clip(&mut self) {
+        if !self.is_zero_clip() {
+            self.push(Cmd::PushClip);
+            self.n_clip += 1;
+        }
+    }
+
+    pub(crate) fn pop_clip(&mut self) {
+        if !self.is_zero_clip() {
+            if matches!(self.cmds.last(), Some(Cmd::PushClip)) {
+                // Nothing was drawn inside the clip, elide it.
+                self.cmds.pop();
+            } else {
+                self.push(Cmd::PopClip);
+            }
+            self.n_clip -= 1;
+        }
+    }
+
+    pub(crate) fn push_zero_clip(&mut self) {
+        self.n_zero_clip += 1;
+    }
+
+    pub(crate) fn pop_zero_clip(&mut self) {
+        self.n_zero_clip -= 1;
+    }
+
+    pub(crate) fn is_zero_clip(&mut self) -> bool {
+        self.n_zero_clip > 0
+    }
+
+    pub(crate) fn clip_strip(&mut self, cmd_clip_strip: CmdClipStrip) {
+        if !self.is_zero_clip() && !matches!(self.cmds.last(), Some(Cmd::PushClip)) {
+            self.cmds.push(Cmd::ClipStrip(cmd_clip_strip));
+        }
+    }
+
+    pub(crate) fn clip_fill(&mut self, x: u32, width: u32) {
+        if !self.is_zero_clip() && !matches!(self.cmds.last(), Some(Cmd::PushClip)) {
+            self.cmds.push(Cmd::ClipFill(CmdClipFill { x, width }));
+        }
+    }
 }