diff --git a/src/stb_image.h b/src/stb_image.h
index 1b4b337328ed..1c2096a3a0d7 100644
--- a/src/stb_image.h
+++ b/src/stb_image.h
@@ -786,6 +786,20 @@ static int stbi__sse2_available(void)
 #endif
 #endif
 
+// RISC-V VECTOR
+#if defined(STBI_NO_SIMD) && defined(STBI_RVV)
+#undef STBI_RVV
+#endif
+
+#ifdef STBI_RVV
+#include <riscv_vector.h>
+#ifdef _MSC_VER
+#define STBI_SIMD_ALIGN(type, name) __declspec(align(16)) type name
+#else
+#define STBI_SIMD_ALIGN(type, name) type name __attribute__((aligned(16)))
+#endif
+#endif
+
 #ifndef STBI_SIMD_ALIGN
 #define STBI_SIMD_ALIGN(type, name) type name
 #endif
@@ -2910,6 +2924,180 @@ static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
 
 #endif // STBI_NEON
 
+#ifdef STBI_RVV
+
+// risc-v vector integer IDCT. should produce bit-identical
+// results to the generic C version.
+static void stbi__idct_simd(stbi_uc *out, int out_stride, short data[64])
+{
+   const short rot0_0 = stbi__f2f(0.5411961f);
+   const short rot0_1 = stbi__f2f(-1.847759065f);
+   const short rot0_2 = stbi__f2f( 0.765366865f);
+   const short rot1_0 = stbi__f2f( 1.175875602f);
+   const short rot1_1 = stbi__f2f(-0.899976223f);
+   const short rot1_2 = stbi__f2f(-2.562915447f);
+   const short rot2_0 = stbi__f2f(-1.961570560f);
+   const short rot2_1 = stbi__f2f(-0.390180644f);
+   const short rot3_0 = stbi__f2f( 0.298631336f);
+   const short rot3_1 = stbi__f2f( 2.053119869f);
+   const short rot3_2 = stbi__f2f( 3.072711026f);
+   const short rot3_3 = stbi__f2f( 1.501321110f);
+
+   // scratch buffer for data transpose
+   short tmp[64];
+
+   const size_t vl = vsetvl_e16m1(8);
+
+   // column pass
+   {
+      vint16m1_t row0 = vle16_v_i16m1(data + 0*8, vl);
+      vint16m1_t row1 = vle16_v_i16m1(data + 1*8, vl);
+      vint16m1_t row2 = vle16_v_i16m1(data + 2*8, vl);
+      vint16m1_t row3 = vle16_v_i16m1(data + 3*8, vl);
+      vint16m1_t row4 = vle16_v_i16m1(data + 4*8, vl);
+      vint16m1_t row5 = vle16_v_i16m1(data + 5*8, vl);
+      vint16m1_t row6 = vle16_v_i16m1(data + 6*8, vl);
+      vint16m1_t row7 = vle16_v_i16m1(data + 7*8, vl);
+
+      // even part
+      vint16m1_t sum26 = vadd_vv_i16m1(row2, row6, vl);
+      vint32m2_t p1e = vwmul_vx_i32m2(sum26, rot0_0, vl);
+      vint32m2_t t2e = vwmacc_vx_i32m2(p1e, rot0_1, row6, vl);
+      vint32m2_t t3e = vwmacc_vx_i32m2(p1e, rot0_2, row2, vl);
+      vint32m2_t t0e = vsll_vx_i32m2(vwadd_vv_i32m2(row0, row4, vl), 12, vl);
+      vint32m2_t t1e = vsll_vx_i32m2(vwsub_vv_i32m2(row0, row4, vl), 12, vl);
+      vint32m2_t x0 = vadd_vv_i32m2(t0e, t3e, vl);
+      vint32m2_t x3 = vsub_vv_i32m2(t0e, t3e, vl);
+      vint32m2_t x1 = vadd_vv_i32m2(t1e, t2e, vl);
+      vint32m2_t x2 = vsub_vv_i32m2(t1e, t2e, vl);
+
+      // odd part
+      vint16m1_t sum15 = vadd_vv_i16m1(row1, row5, vl);
+      vint16m1_t sum17 = vadd_vv_i16m1(row1, row7, vl);
+      vint16m1_t sum35 = vadd_vv_i16m1(row3, row5, vl);
+      vint16m1_t sum37 = vadd_vv_i16m1(row3, row7, vl);
+      vint32m2_t p5o = vwmul_vx_i32m2(vadd_vv_i16m1(sum17, sum35, vl), rot1_0, vl);
+      vint32m2_t p1o = vwmacc_vx_i32m2(p5o, rot1_1, sum17, vl);
+      vint32m2_t p2o = vwmacc_vx_i32m2(p5o, rot1_2, sum35, vl);
+      vint32m2_t p3o = vwmul_vx_i32m2(sum37, rot2_0, vl);
+      vint32m2_t p4o = vwmul_vx_i32m2(sum15, rot2_1, vl);
+      vint32m2_t sump13o = vadd_vv_i32m2(p1o, p3o, vl);
+      vint32m2_t sump24o = vadd_vv_i32m2(p2o, p4o, vl);
+      vint32m2_t sump23o = vadd_vv_i32m2(p2o, p3o, vl);
+      vint32m2_t sump14o = vadd_vv_i32m2(p1o, p4o, vl);
+      vint32m2_t x4 = vwmacc_vx_i32m2(sump13o, rot3_0, row7, vl);
+      vint32m2_t x5 = vwmacc_vx_i32m2(sump24o, rot3_1, row5, vl);
+      vint32m2_t x6 = vwmacc_vx_i32m2(sump23o, rot3_2, row3, vl);
+      vint32m2_t x7 = vwmacc_vx_i32m2(sump14o, rot3_3, row1, vl);
+
+      // bfly32
+      x0 = vadd_vx_i32m2(x0, 512, vl);
+      x1 = vadd_vx_i32m2(x1, 512, vl);
+      x2 = vadd_vx_i32m2(x2, 512, vl);
+      x3 = vadd_vx_i32m2(x3, 512, vl);
+      vint16m1_t out0 = vnsra_wx_i16m1(vadd_vv_i32m2(x0, x7, vl), 10, vl);
+      vint16m1_t out7 = vnsra_wx_i16m1(vsub_vv_i32m2(x0, x7, vl), 10, vl);
+      vint16m1_t out1 = vnsra_wx_i16m1(vadd_vv_i32m2(x1, x6, vl), 10, vl);
+      vint16m1_t out6 = vnsra_wx_i16m1(vsub_vv_i32m2(x1, x6, vl), 10, vl);
+      vint16m1_t out2 = vnsra_wx_i16m1(vadd_vv_i32m2(x2, x5, vl), 10, vl);
+      vint16m1_t out5 = vnsra_wx_i16m1(vsub_vv_i32m2(x2, x5, vl), 10, vl);
+      vint16m1_t out3 = vnsra_wx_i16m1(vadd_vv_i32m2(x3, x4, vl), 10, vl);
+      vint16m1_t out4 = vnsra_wx_i16m1(vsub_vv_i32m2(x3, x4, vl), 10, vl);
+
+      // 8x8 transpose
+      // I would prefer to implement this transpose in register without save+load,
+      // but rvv does not have shuffle/zip instructions among multiple registers.
+      // what a pity :( --- nihui
+      vsse16_v_i16m1(tmp + 0, 8 * 2, out0, vl);
+      vsse16_v_i16m1(tmp + 1, 8 * 2, out1, vl);
+      vsse16_v_i16m1(tmp + 2, 8 * 2, out2, vl);
+      vsse16_v_i16m1(tmp + 3, 8 * 2, out3, vl);
+      vsse16_v_i16m1(tmp + 4, 8 * 2, out4, vl);
+      vsse16_v_i16m1(tmp + 5, 8 * 2, out5, vl);
+      vsse16_v_i16m1(tmp + 6, 8 * 2, out6, vl);
+      vsse16_v_i16m1(tmp + 7, 8 * 2, out7, vl);
+   }
+
+   // row pass
+   {
+      vint16m1_t row0 = vle16_v_i16m1(tmp + 0*8, vl);
+      vint16m1_t row1 = vle16_v_i16m1(tmp + 1*8, vl);
+      vint16m1_t row2 = vle16_v_i16m1(tmp + 2*8, vl);
+      vint16m1_t row3 = vle16_v_i16m1(tmp + 3*8, vl);
+      vint16m1_t row4 = vle16_v_i16m1(tmp + 4*8, vl);
+      vint16m1_t row5 = vle16_v_i16m1(tmp + 5*8, vl);
+      vint16m1_t row6 = vle16_v_i16m1(tmp + 6*8, vl);
+      vint16m1_t row7 = vle16_v_i16m1(tmp + 7*8, vl);
+
+      // even part
+      vint16m1_t sum26 = vadd_vv_i16m1(row2, row6, vl);
+      vint32m2_t p1e = vwmul_vx_i32m2(sum26, rot0_0, vl);
+      vint32m2_t t2e = vwmacc_vx_i32m2(p1e, rot0_1, row6, vl);
+      vint32m2_t t3e = vwmacc_vx_i32m2(p1e, rot0_2, row2, vl);
+      vint32m2_t t0e = vsll_vx_i32m2(vwadd_vv_i32m2(row0, row4, vl), 12, vl);
+      vint32m2_t t1e = vsll_vx_i32m2(vwsub_vv_i32m2(row0, row4, vl), 12, vl);
+      vint32m2_t x0 = vadd_vv_i32m2(t0e, t3e, vl);
+      vint32m2_t x3 = vsub_vv_i32m2(t0e, t3e, vl);
+      vint32m2_t x1 = vadd_vv_i32m2(t1e, t2e, vl);
+      vint32m2_t x2 = vsub_vv_i32m2(t1e, t2e, vl);
+
+      // odd part
+      vint16m1_t sum15 = vadd_vv_i16m1(row1, row5, vl);
+      vint16m1_t sum17 = vadd_vv_i16m1(row1, row7, vl);
+      vint16m1_t sum35 = vadd_vv_i16m1(row3, row5, vl);
+      vint16m1_t sum37 = vadd_vv_i16m1(row3, row7, vl);
+      vint32m2_t p5o = vwmul_vx_i32m2(vadd_vv_i16m1(sum17, sum35, vl), rot1_0, vl);
+      vint32m2_t p1o = vwmacc_vx_i32m2(p5o, rot1_1, sum17, vl);
+      vint32m2_t p2o = vwmacc_vx_i32m2(p5o, rot1_2, sum35, vl);
+      vint32m2_t p3o = vwmul_vx_i32m2(sum37, rot2_0, vl);
+      vint32m2_t p4o = vwmul_vx_i32m2(sum15, rot2_1, vl);
+      vint32m2_t sump13o = vadd_vv_i32m2(p1o, p3o, vl);
+      vint32m2_t sump24o = vadd_vv_i32m2(p2o, p4o, vl);
+      vint32m2_t sump23o = vadd_vv_i32m2(p2o, p3o, vl);
+      vint32m2_t sump14o = vadd_vv_i32m2(p1o, p4o, vl);
+      vint32m2_t x4 = vwmacc_vx_i32m2(sump13o, rot3_0, row7, vl);
+      vint32m2_t x5 = vwmacc_vx_i32m2(sump24o, rot3_1, row5, vl);
+      vint32m2_t x6 = vwmacc_vx_i32m2(sump23o, rot3_2, row3, vl);
+      vint32m2_t x7 = vwmacc_vx_i32m2(sump14o, rot3_3, row1, vl);
+
+      // bfly32
+      x0 = vadd_vx_i32m2(x0, (int)(65536 + (128<<17)), vl);
+      x1 = vadd_vx_i32m2(x1, (int)(65536 + (128<<17)), vl);
+      x2 = vadd_vx_i32m2(x2, (int)(65536 + (128<<17)), vl);
+      x3 = vadd_vx_i32m2(x3, (int)(65536 + (128<<17)), vl);
+      vint16m1_t out0 = vnsra_wx_i16m1(vadd_vv_i32m2(x0, x7, vl), 17, vl);
+      vint16m1_t out7 = vnsra_wx_i16m1(vsub_vv_i32m2(x0, x7, vl), 17, vl);
+      vint16m1_t out1 = vnsra_wx_i16m1(vadd_vv_i32m2(x1, x6, vl), 17, vl);
+      vint16m1_t out6 = vnsra_wx_i16m1(vsub_vv_i32m2(x1, x6, vl), 17, vl);
+      vint16m1_t out2 = vnsra_wx_i16m1(vadd_vv_i32m2(x2, x5, vl), 17, vl);
+      vint16m1_t out5 = vnsra_wx_i16m1(vsub_vv_i32m2(x2, x5, vl), 17, vl);
+      vint16m1_t out3 = vnsra_wx_i16m1(vadd_vv_i32m2(x3, x4, vl), 17, vl);
+      vint16m1_t out4 = vnsra_wx_i16m1(vsub_vv_i32m2(x3, x4, vl), 17, vl);
+
+      // clamp 0~255
+      vuint8m1_t out0u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out0, 0, vl)), 0, vl);
+      vuint8m1_t out7u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out7, 0, vl)), 0, vl);
+      vuint8m1_t out1u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out1, 0, vl)), 0, vl);
+      vuint8m1_t out6u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out6, 0, vl)), 0, vl);
+      vuint8m1_t out2u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out2, 0, vl)), 0, vl);
+      vuint8m1_t out5u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out5, 0, vl)), 0, vl);
+      vuint8m1_t out3u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out3, 0, vl)), 0, vl);
+      vuint8m1_t out4u8 = vnclipu_wx_u8m1(vreinterpret_v_i16m1_u16m2(vmax_vx_i16m1(out4, 0, vl)), 0, vl);
+
+      // 8x8 transpose
+      vsse8_v_u8m1(out + 0, out_stride, out0u8, vl);
+      vsse8_v_u8m1(out + 1, out_stride, out1u8, vl);
+      vsse8_v_u8m1(out + 2, out_stride, out2u8, vl);
+      vsse8_v_u8m1(out + 3, out_stride, out3u8, vl);
+      vsse8_v_u8m1(out + 4, out_stride, out4u8, vl);
+      vsse8_v_u8m1(out + 5, out_stride, out5u8, vl);
+      vsse8_v_u8m1(out + 6, out_stride, out6u8, vl);
+      vsse8_v_u8m1(out + 7, out_stride, out7u8, vl);
+   }
+}
+
+#endif // STBI_RVV
+
 #define STBI__MARKER_none 0xff
 // if there's a pending marker from the entropy stream, return that
 // otherwise, fetch from the stream and get a marker. if there's no
@@ -3524,7 +3712,7 @@ static stbi_uc *stbi__resample_row_hv_2(stbi_uc *out, stbi_uc *in_near, stbi_uc
    return out;
 }
 
-#if defined(STBI_SSE2) || defined(STBI_NEON)
+#if defined(STBI_SSE2) || defined(STBI_NEON) || defined(STBI_RVV)
 static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stbi_uc *in_far, int w, int hs)
 {
    // need to generate 2x2 samples for every one in input
@@ -3536,6 +3724,48 @@ static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stb
    }
 
    t1 = 3*in_near[0] + in_far[0];
+#if defined(STBI_RVV)
+   // process groups of vl*4 pixels for as long as we can.
+   // note we can't handle the last pixel in a row in this loop
+   // because we need to handle the filter boundary conditions.
+   int n = w-1;
+   while (n > 0) {
+      size_t vl = vsetvl_e8m4(n);
+
+      // load and perform the vertical filtering pass
+      vuint8m4_t farb = vle8_v_u8m4(in_far + i, vl);
+      vuint8m4_t nearb = vle8_v_u8m4(in_near + i, vl);
+      vuint16m8_t curr = vadd_vv_u16m8(vwmulu_vx_u16m8(nearb, 3, vl), vwcvtu_x_x_v_u16m8(farb, vl), vl); // current row
+
+      // horizontal filter works the same based on shifted vers of current
+      // row. "prev" is current row shifted right by 1 pixel; we need to
+      // insert the previous pixel value (from t1).
+      // "next" is current row shifted left by 1 pixel, with first pixel
+      // of next block of vl*4 pixels added in.
+      vuint16m8_t prev = vslide1up_vx_u16m8(curr, t1, vl);
+      vuint16m8_t next = vslide1down_vx_u16m8(curr, 3*in_near[i+vl] + in_far[i+vl], vl);
+
+      // horizontal filter, polyphase implementation since it's convenient:
+      // even pixels = 3*cur + prev + 8
+      // odd pixels = 3*cur + next + 8
+      // note the shared term.
+      vuint16m8_t curs = vmacc_vx_u16m8(vmv_v_x_u16m8(8, vl), 3, curr, vl);
+      vuint16m8_t even = vadd_vv_u16m8(curs, prev, vl);
+      vuint16m8_t odd = vadd_vv_u16m8(curs, next, vl);
+
+      // undo scaling and round, then store with even/odd phases interleaved
+      vuint8m4_t evenu8 = vnclipu_wx_u8m4(vsrl_vx_u16m8(even, 4, vl), 0, vl);
+      vuint8m4_t oddu8 = vnclipu_wx_u8m4(vsrl_vx_u16m8(odd, 4, vl), 0, vl);
+      vuint8m4x2_t o = vcreate_u8m4x2(evenu8, oddu8);
+      vsseg2e8_v_u8m4x2(out + i*2, o, vl);
+
+      // "previous" value for next iter
+      t1 = 3*in_near[i+(vl-1)] + in_far[i+(vl-1)];
+
+      i += vl;
+      n -= vl;
+   }
+#else
    // process groups of 8 pixels for as long as we can.
    // note we can't handle the last pixel in a row in this loop
    // because we need to handle the filter boundary conditions.
@@ -3622,6 +3852,7 @@ static stbi_uc *stbi__resample_row_hv_2_simd(stbi_uc *out, stbi_uc *in_near, stb
       // "previous" value for next iter
       t1 = 3*in_near[i+7] + in_far[i+7];
    }
+#endif
 
    t0 = t1;
    t1 = 3*in_near[i] + in_far[i];
@@ -3680,7 +3911,7 @@ static void stbi__YCbCr_to_RGB_row(stbi_uc *out, const stbi_uc *y, const stbi_uc
    }
 }
 
-#if defined(STBI_SSE2) || defined(STBI_NEON)
+#if defined(STBI_SSE2) || defined(STBI_NEON) || defined(STBI_RVV)
 static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc const *pcb, stbi_uc const *pcr, int count, int step)
 {
    int i = 0;
@@ -3747,7 +3978,47 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
 #endif
 
 #ifdef STBI_NEON
-   // in this version, step=3 support would be easy to add. but is there demand?
+   if (step == 3) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      uint8x8_t signflip = vdup_n_u8(0x80);
+      int16x8_t cr_const0 = vdupq_n_s16( (short) ( 1.40200f*4096.0f+0.5f));
+      int16x8_t cr_const1 = vdupq_n_s16( - (short) ( 0.71414f*4096.0f+0.5f));
+      int16x8_t cb_const0 = vdupq_n_s16( - (short) ( 0.34414f*4096.0f+0.5f));
+      int16x8_t cb_const1 = vdupq_n_s16( (short) ( 1.77200f*4096.0f+0.5f));
+
+      for (; i+7 < count; i += 8) {
+         // load
+         uint8x8_t y_bytes = vld1_u8(y + i);
+         uint8x8_t cr_bytes = vld1_u8(pcr + i);
+         uint8x8_t cb_bytes = vld1_u8(pcb + i);
+         int8x8_t cr_biased = vreinterpret_s8_u8(vsub_u8(cr_bytes, signflip));
+         int8x8_t cb_biased = vreinterpret_s8_u8(vsub_u8(cb_bytes, signflip));
+
+         // expand to s16
+         int16x8_t yws = vreinterpretq_s16_u16(vshll_n_u8(y_bytes, 4));
+         int16x8_t crw = vshll_n_s8(cr_biased, 7);
+         int16x8_t cbw = vshll_n_s8(cb_biased, 7);
+
+         // color transform
+         int16x8_t cr0 = vqdmulhq_s16(crw, cr_const0);
+         int16x8_t cb0 = vqdmulhq_s16(cbw, cb_const0);
+         int16x8_t cr1 = vqdmulhq_s16(crw, cr_const1);
+         int16x8_t cb1 = vqdmulhq_s16(cbw, cb_const1);
+         int16x8_t rws = vaddq_s16(yws, cr0);
+         int16x8_t gws = vaddq_s16(vaddq_s16(yws, cb0), cr1);
+         int16x8_t bws = vaddq_s16(yws, cb1);
+
+         // undo scaling, round, convert to byte
+         uint8x8x3_t o;
+         o.val[0] = vqrshrun_n_s16(rws, 4);
+         o.val[1] = vqrshrun_n_s16(gws, 4);
+         o.val[2] = vqrshrun_n_s16(bws, 4);
+
+         // store, interleaving r/g/b
+         vst3_u8(out, o);
+         out += 8*3;
+      }
+   }
 
    if (step == 4) {
       // this is a fairly straightforward implementation and not super-optimized.
       uint8x8_t signflip = vdup_n_u8(0x80);
@@ -3792,6 +4063,104 @@ static void stbi__YCbCr_to_RGB_simd(stbi_uc *out, stbi_uc const *y, stbi_uc cons
    }
 #endif
+#ifdef STBI_RVV
+   if (step == 3) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      const unsigned char signflip = 0x80;
+      const short cr_const0 = (short) ( 1.40200f*4096.0f+0.5f);
+      const short cr_const1 = - (short) ( 0.71414f*4096.0f+0.5f);
+      const short cb_const0 = - (short) ( 0.34414f*4096.0f+0.5f);
+      const short cb_const1 = (short) ( 1.77200f*4096.0f+0.5f);
+
+      int n = count;
+      while (n > 0) {
+         size_t vl = vsetvl_e8m2(n);
+
+         // load
+         vuint8m2_t y_bytes = vle8_v_u8m2(y + i, vl);
+         vuint8m2_t cr_bytes = vle8_v_u8m2(pcr + i, vl);
+         vuint8m2_t cb_bytes = vle8_v_u8m2(pcb + i, vl);
+         vint8m2_t cr_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cr_bytes, signflip, vl));
+         vint8m2_t cb_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cb_bytes, signflip, vl));
+
+         // expand to s16
+         vint16m4_t yws = vadd_vx_i16m4(vsll_vx_i16m4(vreinterpret_v_u16m4_i16m4(vwcvtu_x_x_v_u16m4(y_bytes, vl)), 4, vl), 8, vl);
+         vint16m4_t crw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cr_biased, vl), 8, vl);
+         vint16m4_t cbw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cb_biased, vl), 8, vl);
+
+         // color transform
+         vint16m4_t cr0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const0, vl), 16, vl), 0, vl);
+         vint16m4_t cb0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const0, vl), 16, vl), 0, vl);
+         vint16m4_t cr1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const1, vl), 16, vl), 0, vl);
+         vint16m4_t cb1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const1, vl), 16, vl), 0, vl);
+         vint16m4_t rws = vadd_vv_i16m4(yws, cr0, vl);
+         vint16m4_t gws = vadd_vv_i16m4(vadd_vv_i16m4(yws, cb0, vl), cr1, vl);
+         vint16m4_t bws = vadd_vv_i16m4(yws, cb1, vl);
+
+         // undo scaling, round, convert to byte
+         vuint8m2_t rb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(rws, 4, vl), 0, vl)), 0, vl);
+         vuint8m2_t gb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(gws, 4, vl), 0, vl)), 0, vl);
+         vuint8m2_t bb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(bws, 4, vl), 0, vl)), 0, vl);
+
+         // store, interleaving r/g/b
+         vuint8m2x3_t o = vcreate_u8m2x3(rb, gb, bb);
+         vsseg3e8_v_u8m2x3(out, o, vl);
+         out += vl*3;
+
+         i += vl;
+         n -= vl;
+      }
+   }
+   if (step == 4) {
+      // this is a fairly straightforward implementation and not super-optimized.
+      const unsigned char signflip = 128;
+      const short cr_const0 = (short) ( 1.40200f*4096.0f+0.5f);
+      const short cr_const1 = - (short) ( 0.71414f*4096.0f+0.5f);
+      const short cb_const0 = - (short) ( 0.34414f*4096.0f+0.5f);
+      const short cb_const1 = (short) ( 1.77200f*4096.0f+0.5f);
+
+      int n = count;
+      while (n > 0) {
+         size_t vl = vsetvl_e8m2(n);
+
+         // load
+         vuint8m2_t y_bytes = vle8_v_u8m2(y + i, vl);
+         vuint8m2_t cr_bytes = vle8_v_u8m2(pcr + i, vl);
+         vuint8m2_t cb_bytes = vle8_v_u8m2(pcb + i, vl);
+         vint8m2_t cr_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cr_bytes, signflip, vl));
+         vint8m2_t cb_biased = vreinterpret_v_u8m2_i8m2(vsub_vx_u8m2(cb_bytes, signflip, vl));
+
+         // expand to s16
+         vint16m4_t yws = vadd_vx_i16m4(vsll_vx_i16m4(vreinterpret_v_u16m4_i16m4(vwcvtu_x_x_v_u16m4(y_bytes, vl)), 4, vl), 8, vl);
+         vint16m4_t crw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cr_biased, vl), 8, vl);
+         vint16m4_t cbw = vsll_vx_i16m4(vwcvt_x_x_v_i16m4(cb_biased, vl), 8, vl);
+
+         // color transform
+         vint16m4_t cr0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const0, vl), 16, vl), 0, vl);
+         vint16m4_t cb0 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const0, vl), 16, vl), 0, vl);
+         vint16m4_t cr1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(crw, cr_const1, vl), 16, vl), 0, vl);
+         vint16m4_t cb1 = vnclip_wx_i16m4(vsra_vx_i32m8(vwmul_vx_i32m8(cbw, cb_const1, vl), 16, vl), 0, vl);
+         vint16m4_t rws = vadd_vv_i16m4(yws, cr0, vl);
+         vint16m4_t gws = vadd_vv_i16m4(vadd_vv_i16m4(yws, cb0, vl), cr1, vl);
+         vint16m4_t bws = vadd_vv_i16m4(yws, cb1, vl);
+
+         // undo scaling, round, convert to byte
+         vuint8m2_t rb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(rws, 4, vl), 0, vl)), 0, vl);
+         vuint8m2_t gb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(gws, 4, vl), 0, vl)), 0, vl);
+         vuint8m2_t bb = vnclipu_wx_u8m2(vreinterpret_v_i16m4_u16m4(vmax_vx_i16m4(vsra_vx_i16m4(bws, 4, vl), 0, vl)), 0, vl);
+         vuint8m2_t ab = vmv_v_x_u8m2(255, vl);
+
+         // store, interleaving r/g/b/a
+         vuint8m2x4_t o = vcreate_u8m2x4(rb, gb, bb, ab);
+         vsseg4e8_v_u8m2x4(out, o, vl);
+         out += vl*4;
+
+         i += vl;
+         n -= vl;
+      }
+   }
+#endif
+
 
    for (; i < count; ++i) {
       int y_fixed = (y[i] << 20) + (1<<19); // rounding
       int r,g,b;
@@ -3835,6 +4204,12 @@ static void stbi__setup_jpeg(stbi__jpeg *j)
    j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
    j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
 #endif
+
+#ifdef STBI_RVV
+   j->idct_block_kernel = stbi__idct_simd;
+   j->YCbCr_to_RGB_kernel = stbi__YCbCr_to_RGB_simd;
+   j->resample_row_hv_2_kernel = stbi__resample_row_hv_2_simd;
+#endif
 }
 
 // clean up the temporary component buffers
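
A note on enabling the new path: unlike STBI_SSE2 and STBI_NEON, STBI_RVV is not auto-detected anywhere in this patch, so the define has to come from the translation unit or the build. Below is a minimal test driver sketch, assuming a GCC/Clang RISC-V toolchain with the vector extension available; the file name and flags are illustrative and not part of the patch.

    /* rvv_decode.c -- hypothetical driver; assumes a V-capable target and toolchain */
    #define STBI_RVV                     /* opt into the RISC-V vector kernels above */
    #define STB_IMAGE_IMPLEMENTATION
    #include "stb_image.h"
    #include <stdio.h>

    int main(int argc, char **argv)
    {
       int w, h, comp;
       unsigned char *px;
       if (argc < 2) { fprintf(stderr, "usage: %s image.jpg\n", argv[0]); return 1; }
       px = stbi_load(argv[1], &w, &h, &comp, 3);   /* request 3-channel RGB output */
       if (!px) { fprintf(stderr, "decode failed: %s\n", stbi_failure_reason()); return 1; }
       printf("%d x %d (%d source components)\n", w, h, comp);
       stbi_image_free(px);
       return 0;
    }

A build line along the lines of "gcc -O2 -march=rv64gcv -o rvv_decode rvv_decode.c" is assumed; the exact -march string depends on the toolchain and target.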
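
For reference, the arithmetic behind the upsampling kernel's "even pixels = 3*cur + prev + 8, odd pixels = 3*cur + next + 8" comment is easier to see in scalar form. The sketch below is a plain-C restatement of that filter, not code from the patch; the helper name is made up, and the edge handling mirrors what stb_image's generic 2x2 resampler does around the vectorized loop.

    /* hv2_upsample_row_ref: scalar sketch of one 2x horizontal upsample pass.
       Each input pixel is first blended vertically (cur = 3*near + far, in 1/4 units);
       each output pixel then blends cur 3:1 with a horizontal neighbour (in 1/16 units). */
    static void hv2_upsample_row_ref(unsigned char *out, const unsigned char *in_near,
                                     const unsigned char *in_far, int w)
    {
       int i, prev, cur;
       prev = 3*in_near[0] + in_far[0];
       out[0] = (unsigned char)((prev + 2) >> 2);                  /* left edge: no left neighbour */
       for (i = 1; i < w; ++i) {
          cur = 3*in_near[i] + in_far[i];
          out[i*2-1] = (unsigned char)((3*prev + cur + 8) >> 4);   /* odd phase leans on prev */
          out[i*2  ] = (unsigned char)((prev + 3*cur + 8) >> 4);   /* even phase leans on cur */
          prev = cur;
       }
       out[w*2-1] = (unsigned char)((prev + 2) >> 2);              /* right edge: no right neighbour */
    }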
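
Similarly, the color-transform constants used by the new kernels (and by the NEON path above) are the standard JFIF YCbCr-to-RGB factors in 12-bit fixed point, the same 4096 scale that stbi__f2f applies to the IDCT rotation constants. A scalar sketch of the transform they implement follows; the helper is hypothetical and its rounding is simplified, so treat it as illustrative rather than bit-exact with the SIMD paths.

    /* ycbcr_to_rgb_ref: one pixel of the JFIF YCbCr->RGB transform in 12-bit fixed point.
       R = Y + 1.40200*(Cr-128)
       G = Y - 0.34414*(Cb-128) - 0.71414*(Cr-128)
       B = Y + 1.77200*(Cb-128)                                    */
    static void ycbcr_to_rgb_ref(unsigned char rgb[3], int y, int cb, int cr)
    {
       const int cr_r =  (int)( 1.40200f*4096.0f + 0.5f);
       const int cb_g = -(int)( 0.34414f*4096.0f + 0.5f);
       const int cr_g = -(int)( 0.71414f*4096.0f + 0.5f);
       const int cb_b =  (int)( 1.77200f*4096.0f + 0.5f);
       int crb = cr - 128, cbb = cb - 128;
       int r = ((y << 12) + cr_r*crb            + 2048) >> 12;     /* +2048 rounds the >>12 */
       int g = ((y << 12) + cb_g*cbb + cr_g*crb + 2048) >> 12;
       int b = ((y << 12) + cb_b*cbb            + 2048) >> 12;
       rgb[0] = (unsigned char)(r < 0 ? 0 : r > 255 ? 255 : r);
       rgb[1] = (unsigned char)(g < 0 ? 0 : g > 255 ? 255 : g);
       rgb[2] = (unsigned char)(b < 0 ? 0 : b > 255 ? 255 : b);
    }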