From 8094474e59f59d3494c454524caf5ab9f354083c Mon Sep 17 00:00:00 2001
From: Larry Gritz <lg@larrygritz.com>
Date: Sat, 21 Sep 2024 14:09:35 -0700
Subject: [PATCH] perf: Reduce compile time by trimming template expansion in
 IBA

I was profiling the builds and saw that modules with lots of template
expansion dominate the compile time. For example,
imagebufalgo_pixelmath.cpp alone took 290s to compile on my 2020
MacbookPro (!), and imagebufalgo_addsub.cpp took 84 seconds.

This is all due to the combnatorics of expanding IBA templates via the
DISPATCH macros in imagebufalgo_util.h separately for every type that
the arguments can be. But I claim that most combinations are rarely if
ever used. I mean, how often does anybody need IBA::add() to add an
int8 image to an int16 image? So this PR rewrites those macros to
simplify the cases as follows:

* The common pixel data types are float, half, unint8, and uint16.

* Specialized versions are fully expanded only when the result and
  input images are one of these types. Images not of one of those
  types are first automatically converted to float to make them reduce
  to a common case. That makes uncommon pixel data types like (signed)
  int16 not expand the template, but rather convert to and from float
  and use the float specializations.

* For binary and ternary operations (those with 2 or 3 image inputs),
  if the pixel types of the inputs doesn't match, we make sure they
  both are converted to float. So, for example, we don't need a
  specialized version that adds a half image to a uint16 image -- just
  convert them to float and use the common case. But we do specalize
  if the two inputs are both the same common case, such as adding two
  uint16 images.

* Assume that commonly, the result image will either be float, or will
  be the same pixel data type as the inputs. Other combinations trigger
  assignment to a temporary float IB, then copying with convertion to
  the uncommon use-supplied result buffer.

* Additionally, we gut down on a little bit more templating by moving
  some "deep" methods from the type-templated ImageBuf::Iterator to its
  type-generic non-templated base class IteraterBase.

The net result of all this is an awful lot less template expansion.
With this in place, my laptop compiles imagebufalgo_pixelmath.cpp
in 97s (vs 290 before) and imagebufalgo_addsub.cpp in 26s (from 84).
It takes a big bite out of all the iba files, and reduces project-wide
compile time by over 10%, around 30s out of 300 for a fresh, uncached,
optimized build with 16 threads.

Signed-off-by: Larry Gritz <lg@larrygritz.com>
---
 src/include/OpenImageIO/imagebuf.h          |  48 +--
 src/include/OpenImageIO/imagebufalgo_util.h | 431 +++++++++++++-------
 2 files changed, 305 insertions(+), 174 deletions(-)

diff --git a/src/include/OpenImageIO/imagebuf.h b/src/include/OpenImageIO/imagebuf.h
index e209dba08e..4fd4a4db30 100644
--- a/src/include/OpenImageIO/imagebuf.h
+++ b/src/include/OpenImageIO/imagebuf.h
@@ -1604,6 +1604,30 @@ class OIIO_API ImageBuf {
                                  m_nchannels);
         }
 
+        /// Set the number of deep data samples at this pixel. (Only use
+        /// this if deep_alloc() has not yet been called on the buffer.)
+        void set_deep_samples(int n)
+        {
+            ensure_writable();
+            return const_cast<ImageBuf*>(m_ib)->set_deep_samples(m_x, m_y, m_z,
+                                                                 n);
+        }
+
+        /// Set the deep data value of sample s of channel c. (Only use this
+        /// if deep_alloc() has been called.)
+        void set_deep_value(int c, int s, float value)
+        {
+            ensure_writable();
+            return const_cast<ImageBuf*>(m_ib)->set_deep_value(m_x, m_y, m_z, c,
+                                                               s, value);
+        }
+        void set_deep_value(int c, int s, uint32_t value)
+        {
+            ensure_writable();
+            return const_cast<ImageBuf*>(m_ib)->set_deep_value(m_x, m_y, m_z, c,
+                                                               s, value);
+        }
+
     protected:
         friend class ImageBuf;
         friend class ImageBufImpl;
@@ -1799,30 +1823,6 @@ class OIIO_API ImageBuf {
                                  TypeDesc::BASETYPE(m_pixeltype), m_proxydata,
                                  m_nchannels);
         }
-
-        /// Set the number of deep data samples at this pixel. (Only use
-        /// this if deep_alloc() has not yet been called on the buffer.)
-        void set_deep_samples(int n)
-        {
-            ensure_writable();
-            return const_cast<ImageBuf*>(m_ib)->set_deep_samples(m_x, m_y, m_z,
-                                                                 n);
-        }
-
-        /// Set the deep data value of sample s of channel c. (Only use this
-        /// if deep_alloc() has been called.)
-        void set_deep_value(int c, int s, float value)
-        {
-            ensure_writable();
-            return const_cast<ImageBuf*>(m_ib)->set_deep_value(m_x, m_y, m_z, c,
-                                                               s, value);
-        }
-        void set_deep_value(int c, int s, uint32_t value)
-        {
-            ensure_writable();
-            return const_cast<ImageBuf*>(m_ib)->set_deep_value(m_x, m_y, m_z, c,
-                                                               s, value);
-        }
     };
 
 
diff --git a/src/include/OpenImageIO/imagebufalgo_util.h b/src/include/OpenImageIO/imagebufalgo_util.h
index 97924cc936..afbc944208 100644
--- a/src/include/OpenImageIO/imagebufalgo_util.h
+++ b/src/include/OpenImageIO/imagebufalgo_util.h
@@ -275,8 +275,16 @@ inline TypeDesc type_merge (TypeDesc a, TypeDesc b, TypeDesc c)
 }
 
 
+inline bool is_common_pixel_type(TypeDesc t)
+{
+    return t.basetype == TypeDesc::FLOAT || t.basetype == TypeDesc::UINT8 
+        || t.basetype == TypeDesc::UINT16 || t.basetype == TypeDesc::HALF;
+}
+
+
 
 // Macro to call a type-specialized version func<type>(R,...)
+// instantiated for EVERY pixel type we support.
 #define OIIO_DISPATCH_TYPES(ret,name,func,type,R,...)                   \
     switch (type.basetype) {                                            \
     case TypeDesc::FLOAT :                                              \
@@ -329,6 +337,8 @@ inline TypeDesc type_merge (TypeDesc a, TypeDesc b, TypeDesc c)
     }
 
 // Macro to call a type-specialized version func<Rtype,Atype>(R,...).
+// instantiated for EVERY combination of pixel type we support.
+// Note: The combinatorics here are brutal! Only do this if necessary.
 #define OIIO_DISPATCH_TYPES2(ret,name,func,Rtype,Atype,R,...)           \
     switch (Rtype.basetype) {                                           \
     case TypeDesc::FLOAT :                                              \
@@ -380,10 +390,10 @@ inline TypeDesc type_merge (TypeDesc a, TypeDesc b, TypeDesc c)
         /* other types: punt and convert to float, then copy back */    \
         ImageBuf Rtmp;                                                  \
         if ((R).initialized())                                          \
-            Rtmp.copy (R, TypeDesc::FLOAT);                             \
+            Rtmp.copy(R, TypeFloat);                                    \
         ret = func<float> (Rtmp, __VA_ARGS__);                          \
         if (ret)                                                        \
-            (R).copy (Rtmp);                                            \
+            (R).copy(Rtmp, (R).pixeltype());                            \
         else                                                            \
             (R).errorfmt("{}", Rtmp.geterror());                        \
         }                                                               \
@@ -400,101 +410,199 @@ inline TypeDesc type_merge (TypeDesc a, TypeDesc b, TypeDesc c)
         ret = func<Rtype,half> (R, A, __VA_ARGS__); break;              \
     case TypeDesc::UINT16:                                              \
         ret = func<Rtype,unsigned short> (R, A, __VA_ARGS__); break;    \
-    default: {                                                          \
-        /* other types: punt and convert to float, then copy back */    \
-        ImageBuf Atmp;                                                  \
-        Atmp.copy (A, TypeDesc::FLOAT);                                 \
-        ret = func<Rtype,float> (R, Atmp, __VA_ARGS__);                 \
-        }                                                               \
+    default:                                                            \
+        OIIO_DASSERT(0 && "bad type");                                  \
+        ret = false;                                                    \
+        break;                                                          \
     }
 
 // Macro to call a type-specialized version func<Rtype,Atype>(R,A,...) for
-// the most common types, and even for uncommon types when src and dst types
-// are identical. It will auto-convert remaining rare cases to float.
-#define OIIO_DISPATCH_COMMON_TYPES2(ret,name,func,Rtype,Atype,R,A,...)  \
-    if (Rtype == Atype) {                                               \
-        /* All data types, when Rtype == Atype */                       \
-        switch (Atype.basetype) {                                       \
-        case TypeDesc::FLOAT :                                          \
-            ret = func<float,float> (R, A, __VA_ARGS__); break;         \
-        case TypeDesc::UINT8 :                                          \
-            ret = func<uint8_t,uint8_t> (R, A, __VA_ARGS__); break;     \
-        case TypeDesc::UINT16:                                          \
-            ret = func<uint16_t,uint16_t> (R, A, __VA_ARGS__); break;   \
-        case TypeDesc::HALF  :                                          \
-            ret = func<half,half> (R, A, __VA_ARGS__); break;           \
-        case TypeDesc::INT8 :                                           \
-            ret = func<char,char> (R, A, __VA_ARGS__); break;           \
-        case TypeDesc::INT16 :                                          \
-            ret = func<short,short> (R, A, __VA_ARGS__); break;         \
-        case TypeDesc::UINT :                                           \
-            ret = func<uint32_t,uint32_t> (R, A, __VA_ARGS__); break;   \
-        case TypeDesc::INT :                                            \
-            ret = func<int,int> (R, A, __VA_ARGS__); break;             \
-        case TypeDesc::DOUBLE :                                         \
-            ret = func<double,double> (R, A, __VA_ARGS__); break;       \
-        default:                                                        \
-            (R).errorfmt("{}: Unsupported pixel data format '{}'", name, Atype); \
-            ret = false;                                                \
+// the most common types. It will auto-convert remaining rare cases to float.
+#define OIIO_DISPATCH_COMMON_TYPES2_FULL(ret, name, func, Rtype_, Atype_, R, A, ...)\
+    {                                                                          \
+        OIIO_ASSERT((R).initialized());                                        \
+        ret       = false;                                                     \
+        auto Rptr = &R;                                                        \
+        auto Aptr = &A;                                                        \
+        ImageBuf Rtmp, Atmp;                                                   \
+        TypeDesc Rtype = Rtype_, Atype = Atype_;                               \
+        if (!ImageBufAlgo::is_common_pixel_type(Atype)) {                      \
+            Atype = TypeDesc::FLOAT;                                           \
+            Atmp.copy(A, Atype);                                               \
+            Aptr = &Atmp;                                                      \
+        }                                                                      \
+        if (!ImageBufAlgo::is_common_pixel_type(Rtype)) {                      \
+            Rtype = TypeDesc::FLOAT;                                           \
+            Rtmp.copy(R, Rtype);                                               \
+            Rptr = &Rtmp;                                                      \
+        }                                                                      \
+        if (Rtype == TypeFloat) {                                              \
+            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret, name, func, float, Atype,    \
+                                             *Rptr, *Aptr, __VA_ARGS__);       \
+        } else if (Rtype == TypeUInt8) {                                       \
+            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret, name, func, uint8_t, Atype,  \
+                                             *Rptr, *Aptr, __VA_ARGS__);       \
+        } else if (Rtype == TypeHalf) {                                        \
+            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret, name, func, half, Atype,     \
+                                             *Rptr, *Aptr, __VA_ARGS__);       \
+        } else if (Rtype == TypeUInt16) {                                      \
+            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret, name, func, uint16_t, Atype, \
+                                             *Rptr, *Aptr, __VA_ARGS__);       \
+        } else {                                                               \
+            OIIO_DASSERT(0 && "bad type");                                     \
+        }                                                                      \
+        if (Rptr != &(R)) {                                                    \
+            /* If we used Rtmp, copy pixels or errors to the real R */         \
+            if (ret)                                                           \
+                (R).copy(*Rptr, (R).pixeltype());                              \
+            else                                                               \
+                (R).errorfmt("{}", Rptr->geterror());                          \
+        }                                                                      \
+    }
+
+
+// Macro to call a type-specialized version func<Rtype,Atype>(R,A,...) for
+// the most common types. It will auto-convert remaining rare cases to float.
+#define OIIO_DISPATCH_COMMON_TYPES2(ret, name, func, Rtype_, Atype_, R, A, ...)\
+    {                                                                          \
+        OIIO_ASSERT((R).initialized());                                        \
+        ret       = false;                                                     \
+        auto Rptr = &R;                                                        \
+        auto Aptr = &A;                                                        \
+        ImageBuf Rtmp, Atmp;                                                   \
+        TypeDesc Rtype = Rtype_, Atype = Atype_;                               \
+        if (!ImageBufAlgo::is_common_pixel_type(Atype)) {                      \
+            Atype = TypeDesc::FLOAT;                                           \
+            Atmp.copy(A, Atype);                                               \
+            Aptr = &Atmp;                                                      \
+        }                                                                      \
+        if (!ImageBufAlgo::is_common_pixel_type(Rtype) || (Rtype != Atype && Rtype != TypeFloat)) {  \
+            Rtype = TypeDesc::FLOAT;                                           \
+            Rtmp.copy(R, Rtype);                                               \
+            Rptr = &Rtmp;                                                      \
+        }                                                                      \
+        OIIO_ASSERT(Rtype == TypeFloat || Rtype == Atype);                     \
+        if (Rtype == TypeFloat) {                                              \
+            switch (Atype.basetype) {                                          \
+            case TypeDesc::FLOAT :                                             \
+                ret = func<float,float> (*Rptr, *Aptr, __VA_ARGS__); break;            \
+            case TypeDesc::UINT8 :                                             \
+                ret = func<float,unsigned char> (*Rptr, *Aptr, __VA_ARGS__); break;    \
+            case TypeDesc::HALF  :                                             \
+                ret = func<float,half> (*Rptr, *Aptr, __VA_ARGS__); break;             \
+            case TypeDesc::UINT16:                                             \
+                ret = func<float,unsigned short> (*Rptr, *Aptr, __VA_ARGS__); break;   \
+            default:                                                           \
+                OIIO_DASSERT(0 && "bad type");                                 \
+                ret = false;                                                   \
+                break;                                                         \
+            }                                                                  \
+        } else if (Rtype == TypeUInt8) {                                       \
+            switch (Atype.basetype) {                                          \
+            case TypeDesc::FLOAT :                                             \
+                ret = func<uint8_t,float> (*Rptr, *Aptr, __VA_ARGS__); break;            \
+            case TypeDesc::UINT8 :                                             \
+                ret = func<uint8_t,unsigned char> (*Rptr, *Aptr, __VA_ARGS__); break;    \
+            /* case TypeDesc::HALF  :*/                                           \
+            /*     ret = func<uint8_t,half> (*Rptr, *Aptr, __VA_ARGS__); break;*/           \
+            /* case TypeDesc::UINT16:*/                                           \
+            /*     ret = func<uint8_t,unsigned short> (*Rptr, *Aptr, __VA_ARGS__); break;*/ \
+            default:                                                           \
+                OIIO_DASSERT(0 && "bad type");                                 \
+                ret = false;                                                   \
+                break;                                                         \
+            }                                                                  \
+        } else if (Rtype == TypeHalf) {                                        \
+            switch (Atype.basetype) {                                          \
+            case TypeDesc::FLOAT :                                             \
+                ret = func<half,float> (*Rptr, *Aptr, __VA_ARGS__); break;            \
+            /* case TypeDesc::UINT8 : */                                            \
+            /*     ret = func<half,unsigned char> (*Rptr, *Aptr, __VA_ARGS__); break; */   \
+            case TypeDesc::HALF  :                                             \
+                ret = func<half,half> (*Rptr, *Aptr, __VA_ARGS__); break;             \
+            /* case TypeDesc::UINT16: */                                            \
+            /*     ret = func<half,unsigned short> (*Rptr, *Aptr, __VA_ARGS__); break; */  \
+            default:                                                           \
+                OIIO_DASSERT(0 && "bad type");                                 \
+                ret = false;                                                   \
+                break;                                                         \
+            }                                                                  \
+        } else if (Rtype == TypeUInt16) {                                      \
+            switch (Atype.basetype) {                                          \
+            case TypeDesc::FLOAT :                                             \
+                ret = func<uint16_t,float> (*Rptr, *Aptr, __VA_ARGS__); break;            \
+            /*case TypeDesc::UINT8 : */                                          \
+            /*    ret = func<uint16_t,unsigned char> (*Rptr, *Aptr, __VA_ARGS__); break; */ \
+            /*case TypeDesc::HALF  : */                                          \
+            /*    ret = func<uint16_t,half> (*Rptr, *Aptr, __VA_ARGS__); break; */          \
+            case TypeDesc::UINT16:                                             \
+                ret = func<uint16_t,unsigned short> (*Rptr, *Aptr, __VA_ARGS__); break;   \
+            default:                                                           \
+                OIIO_DASSERT(0 && "bad type");                                 \
+                ret = false;                                                   \
+                break;                                                         \
+            }                                                                  \
+        } else {                                                               \
+            OIIO_DASSERT(0 && "bad type");                                     \
+        }                                                                      \
+        if (Rptr != &(R)) {                                                    \
+            /* If we used Rtmp, copy pixels or errors to the real R */         \
+            if (ret)                                                           \
+                (R).copy(*Rptr, (R).pixeltype());                              \
+            else                                                               \
+                (R).errorfmt("{}", Rptr->geterror());                          \
+        }                                                                      \
+    }
+
+
+// Macro to call a type-specialized version func<Rtype,Atype>(R,A,...) for
+// the most common types. It will auto-convert other cases to/from float.
+// This is the case for when we don't actually write to the read-only R image.
+#define OIIO_DISPATCH_COMMON_TYPES2_CONST(ret,name,func,Rtype_,Atype_,R,A,...)  \
+    {                                                                   \
+        OIIO_ASSERT((R).initialized());                                 \
+        ret = false;                                                    \
+        auto Rptr = &R;                                                 \
+        auto Aptr = &A;                                                 \
+        ImageBuf Rtmp, Atmp;                                            \
+        TypeDesc Rtype = Rtype_, Atype = Atype_;                        \
+        if (!ImageBufAlgo::is_common_pixel_type(Rtype)) {               \
+            Rtype = TypeDesc::FLOAT;                                    \
+            Rtmp.copy(R, Rtype);                                        \
+            Rptr = &Rtmp;                                               \
+        }                                                               \
+        if (!ImageBufAlgo::is_common_pixel_type(Atype)) {               \
+            Atype = TypeDesc::FLOAT;                                    \
+            Atmp.copy(A, Atype);                                        \
+            Aptr = &Atmp;                                               \
         }                                                               \
-    } else {                                                            \
-        /* When Rtype != Atype, handle the common pairs */              \
         switch (Rtype.basetype) {                                       \
         case TypeDesc::FLOAT :                                          \
-            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,float,Atype,R,A,__VA_ARGS__); \
+            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,float,Atype,*Rptr,*Aptr, __VA_ARGS__); \
             break;                                                      \
         case TypeDesc::UINT8 :                                          \
-            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,uint8_t,Atype,R,A,__VA_ARGS__); \
+            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,uint8_t,Atype,*Rptr,*Aptr, __VA_ARGS__); \
             break;                                                      \
         case TypeDesc::HALF  :                                          \
-            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,half,Atype,R,A,__VA_ARGS__); \
+            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,half,Atype,*Rptr,*Aptr, __VA_ARGS__); \
             break;                                                      \
         case TypeDesc::UINT16:                                          \
-            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,uint16_t,Atype,R,A,__VA_ARGS__); \
+            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,uint16_t,Atype,*Rptr,*Aptr, __VA_ARGS__); \
+            break;                                                      \
+        default:                                                        \
+            OIIO_DASSERT(0 && "bad type");                              \
             break;                                                      \
-        default: {                                                      \
-            /* other combinations: convert to float, then copy back */  \
-            ImageBuf Rtmp;                                              \
-            if ((R).initialized())                                      \
-                Rtmp.copy (R, TypeDesc::FLOAT);                         \
-            OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,float,Atype,Rtmp,A,__VA_ARGS__); \
-            if (ret)                                                    \
-                (R).copy (Rtmp);                                        \
-            else                                                        \
-                (R).errorfmt("{}", Rtmp.geterror());                    \
-            }                                                           \
         }                                                               \
     }
 
 
-// Macro to call a type-specialized version func<Rtype,Atype>(R,A,...) for
-// the most common types. It will auto-convert other cases to/from float.
-// This is the case for when we don't actually write to the read-only R image.
-#define OIIO_DISPATCH_COMMON_TYPES2_CONST(ret,name,func,Rtype,Atype,R,A,...)  \
-    switch (Rtype.basetype) {                                           \
-    case TypeDesc::FLOAT :                                              \
-        OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,float,Atype,R,A,__VA_ARGS__); \
-        break;                                                          \
-    case TypeDesc::UINT8 :                                              \
-        OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,unsigned char,Atype,R,A,__VA_ARGS__); \
-        break;                                                          \
-    case TypeDesc::HALF  :                                              \
-        OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,half,Atype,R,A,__VA_ARGS__); \
-        break;                                                          \
-    case TypeDesc::UINT16:                                              \
-        OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,unsigned short,Atype,R,A,__VA_ARGS__); \
-        break;                                                          \
-    default: {                                                          \
-        /* other types: punt and convert to float, then copy back */    \
-        ImageBuf Rtmp;                                                  \
-        if ((R).initialized())                                          \
-            Rtmp.copy (R, TypeDesc::FLOAT);                             \
-        OIIO_DISPATCH_COMMON_TYPES2_HELP(ret,name,func,float,Atype,Rtmp,A,__VA_ARGS__); \
-    } }
-
-
 // Helper, do not call from the outside world.
-#define OIIO_DISPATCH_COMMON_TYPES3_HELP2(ret,name,func,Rtype,Atype,Btype,R,A,B,...) \
+// This is the the case where we're switching based on the result. Be very
+// stingy about the types directly supported here, for all others, write into
+// a temp float buffer and copy back. Remember that if the result had the same
+// type as both inputs, we wouldn't be here all, so this is already an
+// inefficient case.
+#define OIIO_DISPATCH_COMMON_TYPES3_HELP(ret,name,func,Rtype,Atype,Btype,R,A,B,...) \
     switch (Rtype.basetype) {                                           \
     case TypeDesc::FLOAT :                                              \
         ret = func<float,Atype,Btype> (R,A,B,__VA_ARGS__); break;       \
@@ -504,91 +612,114 @@ inline TypeDesc type_merge (TypeDesc a, TypeDesc b, TypeDesc c)
         ret = func<half,Atype,Btype> (R,A,B,__VA_ARGS__); break;        \
     case TypeDesc::UINT16:                                              \
         ret = func<unsigned short,Atype,Btype> (R,A,B,__VA_ARGS__); break;  \
-    default: {                                                          \
-        /* other types: punt and convert to float, then copy back */    \
-        ImageBuf Rtmp;                                                  \
-        if ((R).initialized())                                          \
-            Rtmp.copy (R, TypeDesc::FLOAT);                             \
-        ret = func<float,Atype,Btype> (R,A,B,__VA_ARGS__);              \
-        if (ret)                                                        \
-            (R).copy (Rtmp);                                            \
-        else                                                            \
-            (R).errorfmt("{}", Rtmp.geterror());                        \
-        }                                                               \
-    }
-
-// Helper, do not call from the outside world.
-#define OIIO_DISPATCH_COMMON_TYPES3_HELP(ret,name,func,Rtype,Atype,Btype,R,A,B,...) \
-    switch (Btype.basetype) {                                           \
-    case TypeDesc::FLOAT :                                              \
-        OIIO_DISPATCH_COMMON_TYPES3_HELP2(ret,name,func,Rtype,Atype,float,R,A,B,__VA_ARGS__); \
-        break;                                                          \
-    case TypeDesc::UINT8 :                                              \
-        OIIO_DISPATCH_COMMON_TYPES3_HELP2(ret,name,func,Rtype,Atype,unsigned char,R,A,B,__VA_ARGS__); \
-        break;                                                          \
-    case TypeDesc::HALF :                                               \
-        OIIO_DISPATCH_COMMON_TYPES3_HELP2(ret,name,func,Rtype,Atype,half,R,A,B,__VA_ARGS__); \
-        break;                                                          \
-    case TypeDesc::UINT16 :                                             \
-        OIIO_DISPATCH_COMMON_TYPES3_HELP2(ret,name,func,Rtype,Atype,unsigned short,R,A,B,__VA_ARGS__); \
+    default:                                                            \
+        OIIO_DASSERT(0 && "bad type");                                  \
         break;                                                          \
-    default: {                                                          \
-        /* other types: punt and convert to float */                    \
-        ImageBuf Btmp;                                                  \
-        Btmp.copy (B, TypeDesc::FLOAT);                                 \
-        OIIO_DISPATCH_COMMON_TYPES3_HELP2(ret,name,func,Rtype,Atype,float,R,A,Btmp,__VA_ARGS__); \
-        }                                                               \
     }
 
 // Macro to call a type-specialized version func<Rtype,Atype,Btype>(R,A,B,...)
-// the most common types, and for all types when all three images have
-// the same type. Remaining rare cases will auto-convert to float.
-#define OIIO_DISPATCH_COMMON_TYPES3(ret,name,func,Rtype,Atype,Btype,R,A,B,...)  \
-    if (Rtype == Atype && Rtype == Btype) {                             \
-        /* All data types, when Rtype == Atype */                       \
+// the most common type combination. Remaining rare cases will auto-convert to
+// float. Specifically: anything not {float,half,uint8,uint16} is converted
+// to float; if A and B's types don't match, they are converted to float; and
+// if R doesn't either match A and B or is float, then A and B are also
+#define OIIO_DISPATCH_COMMON_TYPES3(ret,name,func,Rtype_,Atype_,Btype_,R,A,B,...)  \
+    {                                                                          \
+        OIIO_ASSERT((R).initialized());                                        \
+        ret       = false;                                                     \
+        auto Rptr = &R;                                                        \
+        auto Aptr = &A;                                                        \
+        auto Bptr = &B;                                                        \
+        ImageBuf Rtmp, Atmp, Btmp;                                             \
+        TypeDesc Rtype = Rtype_, Atype = Atype_, Btype = Btype_;               \
+        if (!ImageBufAlgo::is_common_pixel_type(Atype) || (Atype != Btype && Atype != TypeFloat)) {  \
+            Atype = TypeDesc::FLOAT;                                           \
+            Atmp.copy(A, Atype);                                               \
+            Aptr = &Atmp;                                                      \
+        }                                                                      \
+        if (!ImageBufAlgo::is_common_pixel_type(Btype) || (Atype != Btype && Btype != TypeFloat)) {  \
+            Btype = TypeDesc::FLOAT;                                           \
+            Btmp.copy(B, Btype);                                               \
+            Bptr = &Btmp;                                                      \
+        }                                                                      \
+        OIIO_ASSERT(Atype == Btype);                                           \
+        if (!ImageBufAlgo::is_common_pixel_type(Rtype) || (Rtype != Atype && Rtype != TypeFloat)) {  \
+            Rtype = TypeDesc::FLOAT;                                           \
+            Rtmp.copy(R, Rtype);                                               \
+            Rptr = &Rtmp;                                                      \
+        }                                                                      \
+        OIIO_ASSERT(Rtype == Atype || Rtype == TypeFloat);                     \
         switch (Atype.basetype) {                                       \
         case TypeDesc::FLOAT :                                          \
-            ret = func<float,float,float> (R, A, B, __VA_ARGS__); break;\
-        case TypeDesc::UINT8 :                                          \
-            ret = func<uint8_t,uint8_t,uint8_t> (R, A, B, __VA_ARGS__); break; \
-        case TypeDesc::UINT16:                                          \
-            ret = func<uint16_t,uint16_t,uint16_t> (R, A, B, __VA_ARGS__); break; \
-        case TypeDesc::HALF  :                                          \
-            ret = func<half,half,half> (R, A, B, __VA_ARGS__); break;   \
-        case TypeDesc::INT8 :                                           \
-            ret = func<char,char,char> (R, A, B, __VA_ARGS__); break;   \
-        case TypeDesc::INT16 :                                          \
-            ret = func<int16_t,int16_t,int16_t> (R, A, B, __VA_ARGS__); break; \
-        case TypeDesc::UINT :                                           \
-            ret = func<uint32_t,uint32_t,uint32_t> (R, A, B, __VA_ARGS__); break; \
-        case TypeDesc::INT :                                            \
-            ret = func<int,int,int> (R, A, B, __VA_ARGS__); break;      \
-        case TypeDesc::DOUBLE :                                         \
-            ret = func<double,double,double> (R, A, B, __VA_ARGS__); break; \
-        default:                                                        \
-            (R).errorfmt("{}: Unsupported pixel data format '{}'", name, Atype); \
-            ret = false;                                                \
-        }                                                               \
-    } else {                                                            \
-        switch (Atype.basetype) {                                       \
-        case TypeDesc::FLOAT :                                          \
-            OIIO_DISPATCH_COMMON_TYPES3_HELP(ret,name,func,Rtype,float,Btype,R,A,B,__VA_ARGS__); \
+            switch (Rtype.basetype) {                                           \
+            case TypeDesc::FLOAT :                                              \
+                ret = func<float,float,float>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;       \
+            /*case TypeDesc::UINT8 :*/                                              \
+            /*    ret = func<unsigned char,float,float>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;*/  \
+            /*case TypeDesc::HALF  :*/                                              \
+            /*    ret = func<half,float,float>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;*/        \
+            /*case TypeDesc::UINT16:*/                                              \
+            /*    ret = func<unsigned short,float,float>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;*/  \
+            default:                                                            \
+                OIIO_ASSERT(0 && "bad type");                                  \
+                break;                                                          \
+            }                                                           \
             break;                                                      \
         case TypeDesc::UINT8 :                                          \
-            OIIO_DISPATCH_COMMON_TYPES3_HELP(ret,name,func,Rtype,unsigned char,Btype,R,A,B,__VA_ARGS__); \
+            switch (Rtype.basetype) {                                           \
+            case TypeDesc::FLOAT :                                              \
+                ret = func<float,uint8_t,uint8_t>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;       \
+            case TypeDesc::UINT8 :                                              \
+                ret = func<unsigned char,uint8_t,uint8_t>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;  \
+            /*case TypeDesc::HALF  :*/                                              \
+            /*    ret = func<half,uint8_t,uint8_t>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;*/        \
+            /*case TypeDesc::UINT16:*/                                              \
+            /*    ret = func<unsigned short,uint8_t,uint8_t>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;*/  \
+            default:                                                            \
+                OIIO_ASSERT(0 && "bad type");                                  \
+                break;                                                          \
+            }                                                           \
             break;                                                      \
         case TypeDesc::HALF  :                                          \
-            OIIO_DISPATCH_COMMON_TYPES3_HELP(ret,name,func,Rtype,half,Btype,R,A,B,__VA_ARGS__); \
+            switch (Rtype.basetype) {                                           \
+            case TypeDesc::FLOAT :                                              \
+                ret = func<float,half,half>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;       \
+            /*case TypeDesc::UINT8 :*/                                              \
+            /*    ret = func<unsigned char,half,half>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;*/  \
+            case TypeDesc::HALF  :                                              \
+                ret = func<half,half,half>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;        \
+            /*case TypeDesc::UINT16: */                                             \
+            /*    ret = func<unsigned short,half,half>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;*/  \
+            default:                                                            \
+                OIIO_ASSERT(0 && "bad type");                                  \
+                break;                                                          \
+            }                                                           \
             break;                                                      \
         case TypeDesc::UINT16:                                          \
-            OIIO_DISPATCH_COMMON_TYPES3_HELP(ret,name,func,Rtype,unsigned short,Btype,R,A,B,__VA_ARGS__); \
+            switch (Rtype.basetype) {                                           \
+            case TypeDesc::FLOAT :                                              \
+                ret = func<float,uint16_t,uint16_t>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;       \
+            /*case TypeDesc::UINT8 :                                              \
+                ret = func<unsigned char,uint16_t,uint16_t>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break; */ \
+            /*case TypeDesc::HALF  :                                              \
+                ret = func<half,uint16_t,uint16_t>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break; */ \
+            case TypeDesc::UINT16:                                              \
+                ret = func<unsigned short,uint16_t,uint16_t>(*Rptr,*Aptr,*Bptr,__VA_ARGS__); break;  \
+            default:                                                            \
+                OIIO_ASSERT(0 && "bad type");                                  \
+                break;                                                          \
+            }                                                           \
             break;                                                      \
         default:                                                        \
-            /* other types: punt and convert to float */                \
-            ImageBuf Atmp;                                              \
-            Atmp.copy (A, TypeDesc::FLOAT);                             \
-            OIIO_DISPATCH_COMMON_TYPES3_HELP(ret,name,func,Rtype,float,Btype,R,Atmp,B,__VA_ARGS__); \
+            OIIO_ASSERT(0 && "bad type");                              \
+            break;                                                      \
         }                                                               \
+        if (Rptr != &(R)) {                                                    \
+            /* If we used Rtmp, copy pixels or errors to the real R */         \
+            if (ret)                                                           \
+                (R).copy(*Rptr, (R).pixeltype());                              \
+            else                                                               \
+                (R).errorfmt("{}", Rptr->geterror());                          \
+        }                                                                      \
     }
 
 
@@ -611,8 +742,8 @@ inline TypeDesc type_merge (TypeDesc a, TypeDesc b, TypeDesc c)
 #define IBA_FIX_PERCHAN_LEN_DEF(av,len)                                 \
     IBA_FIX_PERCHAN_LEN (av, len, 0.0f, av.size() ? av.back() : 0.0f);
 
-// clang-format on
 
+// clang-format on
 
 
 /// Simple image per-pixel unary operation: Given a source image `src`, return