
Sync with apache/incubator-tvm 6/15/2020 #116

Merged 360 commits on Jun 16, 2020
360 commits
349b1eb
[TFLITE]Select op support for tflite frontend (#5486)
siju-samuel May 7, 2020
a5cfce7
[FRONTEND][TFLite] Fully connected op conversion made in sync with T…
May 7, 2020
6b2323e
[TOPI][Winograd] Optimization of Conv2d Winograd algorithm on Tensor …
wsl-inspur May 7, 2020
25dfd69
Cache PrimExpr instead of raw pointers in bound analyzer (#5533)
May 7, 2020
cfb41e6
fix a few bugs with shape inference and types in the onnx importer (#…
May 7, 2020
ce4d49a
[Frontend][TFLite] ADD_N operator (#5474)
maheshambule May 7, 2020
5e36e5f
[WEB][RUNTIME] TVM WebAssembly JS Runtime (#5506)
tqchen May 7, 2020
132017d
[RELAY][ONNX]ReduceLogSumExp Operator support (#5453)
siju-samuel May 7, 2020
476623a
[RPC][BUGFIX] Fix remote device sync (#5538)
tqchen May 7, 2020
1c8b943
[Refactor][std::string --> String] IRModule is updated with String (#…
May 7, 2020
5040831
[RUNTIME] Store nullptr PackedFunc as nullptr for better error propag…
tqchen May 8, 2020
fca80dd
[Relay-TFLite] FP32 and Quantized Object Detection Model (#5479)
anijain2305 May 8, 2020
4988478
Changes to cpp_rpc to make it work on Android (+ Hexagon offloading) …
May 8, 2020
612b828
Add Onnx Pad v11 (#5539)
May 8, 2020
a420710
fix restructured text (#5541)
tmoreau89 May 8, 2020
9754024
[CRT]fix to reduce RAM size during loading model (#5507)
siju-samuel May 8, 2020
72ade90
Load platform specific lib for tvmdsoop instead of only so (#5542)
tobegit3hub May 8, 2020
4f8d7ef
[RPC] Improve RPCServer AsyncIO support. (#5544)
tqchen May 8, 2020
f623acd
[Rust] Add first stage of updating and rewriting Rust bindings. (#5526)
jroesch May 8, 2020
74a687d
[TE] Fix MakeLoopNest for warp memory (#5382)
roastduck May 9, 2020
7630339
[TIR][Printer] text format printer considering future parsing use (#5…
spectrometerHBH May 9, 2020
57e9178
[Optimization] Warp level reduction support for CUDA (#5498)
wpan11nv May 9, 2020
c1cb6de
A clone of test/python/unittest/test_runtime_micro.py, however (#5546)
tom-gall May 9, 2020
37b3c97
[CI] Install wasmtime for WebAssembly tests (#5494)
kazum May 9, 2020
fb7c648
Apparently, ONNX Conv with no 'pads' defaults to zero padding (#5548)
May 9, 2020
46a00ef
[WEB] WebGPU support (#5545)
tqchen May 9, 2020
8e21d89
[TOPI][RELAY][TENSORFLOW]Math ops added (#5502)
siju-samuel May 11, 2020
76a3069
[RUNTIME] Hexagon driver for offloading kernels to simulator (#5492)
May 11, 2020
7003426
[LINT] clang-format the h,cc,m files. (#5557)
tqchen May 11, 2020
b346536
[BYOC, MergeComposite] Add additional check before re-using the cache…
masahi May 11, 2020
5eeb654
[WEB] Setup lint, doc, test (#5556)
tqchen May 11, 2020
fc14b92
[CI] Update ci-cpu to bionic (#5555)
tqchen May 11, 2020
bc67bd2
[CI] Update ci-cpu to bionic (#5554)
tqchen May 11, 2020
f79e38f
[Fix] Fix conv2d alter op for arm cpu (#5532)
icemelon May 11, 2020
fbbf7e6
[FRONTEND]onnx, mxnet, pytorch mathops added (#5561)
siju-samuel May 11, 2020
6fdf2d8
Fix topi test for tensorcore (#5563)
May 11, 2020
5f1e17d
[Refactor][std::string --> String] IR is updated with String (#5547)
May 11, 2020
25150cb
[DOCKER] Fix vulkansdk in the ci-gpu (#5566)
tqchen May 11, 2020
a22d3b2
[CI] reintroduce docker stage for wasm tests (#5565)
tqchen May 11, 2020
205092b
[CI] Update ci-lint to use the latest image that contains clang-forma…
tqchen May 12, 2020
c12a177
[DOCKER] Add clang-format and nodejs to ci-lint (#5567)
tqchen May 12, 2020
0346399
[TARGET] Phase out WebGL (#5570)
tqchen May 12, 2020
12b0fee
[LINT] Enable clang-format. (#5572)
tqchen May 12, 2020
392e8e7
[CI] Update the ci-gpu to the lastest build with the new vulkansdk. (…
tqchen May 12, 2020
b98ddd6
[Relay] enable blocking format in x86 conv2d and fold scale axis (#5357)
Menooker May 12, 2020
3fb5940
[CI] Fix clang-format error (#5577)
tqchen May 12, 2020
c80c416
Allow ubuntu_install_darknet.sh to work in both 18.04 and 16.04 (#5574)
May 12, 2020
a8b2e6b
[PYTORCH]expand bug fix (#5576)
siju-samuel May 13, 2020
c011e78
[CI] Enable llvm-11 and llvm-10 in build tests, recover webdocs. (#5579)
tqchen May 13, 2020
38e21c5
[PYTORCH] Support max_pool2d_with_indices (#5549)
May 13, 2020
e0a16fd
[Relay] Fixed bug in attribute parsing for pool layers. (#5582)
jwfromm May 13, 2020
e23c04e
[RELAY][TF] Support symbolic newshape for Reshape (#5429)
lixiaoquan May 13, 2020
293d719
Add prim::device op (#5584)
Xuxue1 May 13, 2020
1bc37f3
Fix the runtime raise error (#5586)
huochaitiantang May 13, 2020
4d148f4
[RELAY][Convert Layout] Specify additional layouts in convert layout …
lhutton1 May 13, 2020
b45fa57
Add a quantized conv2 unit test for the tflite front-end (#5558)
giuseros May 13, 2020
05d5ccd
[Relay][Transform] Safe check added for Merge Composite (#5562)
May 13, 2020
d98eeaa
[MXNET]abs, round, reciprocal, sign, softsign, hard_sigmoid (#5587)
siju-samuel May 13, 2020
7802479
[Hexagon] One more fix for concurrency count (#5589)
May 14, 2020
4341abe
Fix JSON graph dumping. (#5591)
areusch May 14, 2020
ed2d2dc
[DOCS] Improve document in reflection (#5593)
liangfu May 14, 2020
3c94d51
Overestimate binary size for microTVM compiled binaries. (#5590)
areusch May 14, 2020
1c7941c
[TFLite Runtime] Fix bug and re-enable RPC execution test (#5436)
michalpiszczek May 15, 2020
7c4270b
[Relay][VM] Memory planner (part 1) (#5144)
jroesch May 15, 2020
d61732f
Add ostream formatters for TargetPtr/TargetVal. (#5592)
areusch May 15, 2020
994c3d9
Pattern Language, Matcher, Rewriter, and Function Paritioner (#5231)
May 15, 2020
b8aebb1
[Reduction] Fix cross thread redunction (#5551)
wpan11nv May 15, 2020
1e00918
Fix TVMArray layout on device (#5599)
areusch May 15, 2020
bd4cb8b
[LLVM] Represent alignment information in LLVM IR (#5598)
May 15, 2020
ba58db9
Add debug mode to tempdir() (#5581)
areusch May 15, 2020
2f63c8a
[PYTORCH]ImplicitTensorToNum support added (#5603)
siju-samuel May 15, 2020
892274a
[PYTORCH]Matmul fix for batch_matmul (#5604)
siju-samuel May 15, 2020
38d0448
fix rpc server bug on VTA (#5607)
tmoreau89 May 16, 2020
52ca6c6
[REFACTOR][IR] Streamline ir/op Registry (#5609)
tqchen May 17, 2020
1c60e71
[TFLITE]GATHER_ND (#5508)
dhruvaray May 18, 2020
a7e6a08
[CUDA] Fix codegen for warp shuffle intrinsics (#5606)
roastduck May 18, 2020
7499c91
Fix a typo. (#5611)
littlefish0123 May 18, 2020
ed63f03
fix pattern topological order (#5612)
May 18, 2020
a641641
[BYOC] Remove kCompiler attr from external functions (#5615)
lhutton1 May 18, 2020
078e3b5
[Relay]Improve Shape Func handling for Tuple inputs (#5467)
kevinthesun May 18, 2020
22b54bd
[Relay][Refactor][std::string --> String] Relay updated with String (…
May 18, 2020
6a8d14b
[KERAS]Global MaxPool3d and AvgPool3d support (#5098)
siju-samuel May 19, 2020
a0d1b76
[IOS] Fix build error of iOS RPC (#5621)
kazum May 19, 2020
67575e0
Fix three typos (#5620)
littlefish0123 May 19, 2020
1b0b71d
[Frontend][Tensorflow] Gather nd bug fix for one dim support in tenso…
May 19, 2020
c6ae0e1
[MXNET]MaxPool3d and AvgPool3d Ops support added (#5614)
siju-samuel May 20, 2020
f3f66d2
[PYTORCH]ReflectionPad2d op (#5624)
siju-samuel May 20, 2020
84b1a26
[BYOC][MergeComposite] if root->args[i] isn't a CallNode, then Donwca…
windclarion May 20, 2020
da9c8b6
[DOCS] Move the api docs to the api subfolder (#5626)
tqchen May 20, 2020
1d5f63d
[RELAY][BYOC] Fix the creation of tuple of tuples in PartitionGraph (…
manupak May 20, 2020
b34f96a
[NODE][PASS] Introduce config to PassContext. (#5631)
tqchen May 20, 2020
d757502
another cmake fix (#5630)
tmoreau89 May 20, 2020
3963a46
Fix typo in test script (#5635)
vinx13 May 20, 2020
f06a3db
Label Pattern Partitions (#5627)
May 21, 2020
1e494d2
[RELAY][PYTORCH]Resize3d, Upsample3d op support (#5633)
siju-samuel May 21, 2020
069a591
[TUTORIAL]TFLite QNN Tutorial (#5595)
siju-samuel May 21, 2020
4087560
Extend AttrPattern to support CallNode and FunctionNode attributes (#…
May 21, 2020
93446e6
[DOCS] Fix the QNN TFLite tutorial build (#5641)
tqchen May 21, 2020
ca1ade8
[RUNTIME][VULKAN] Seg fault in WorkspacePool's destructor (#5632) (#5…
samwyi May 21, 2020
33b9b63
[PYTORCH]Padding support (#5638)
siju-samuel May 21, 2020
4b5ec64
Remove unnecessary print (#5642)
comaniac May 21, 2020
caacf21
[CI] Allow CI_PYTEST_ADD_OPTIONS to be unbound. (#5644)
tqchen May 21, 2020
8c7f09a
[Runtime] Introduce runtime::Array (#5585)
junrushao May 21, 2020
ee41673
[CI] Add log check to the sphinx gallery docs (#5643)
tqchen May 22, 2020
855b02b
[RELAY][BYOC] Preserve type information in Merge Composite (#5640)
mbaret May 22, 2020
58a8047
Add a check Callback to the Pattern Paritioner (#5646)
May 22, 2020
6a174f7
[Relay, Topi][OP] Correlation (#5628)
vinx13 May 22, 2020
23f3057
HG: Commit message of changeset 6281661. (#5622)
hlu1 May 22, 2020
68f6fb1
[AutoTVM] Update XGBoost verbosity option (#5649)
tqchen May 22, 2020
2a5d7cb
[RUNTIME] Resolve constexpr issue in debug mode. (#5651)
tqchen May 22, 2020
6285923
µtvm debug improvements (#5648)
areusch May 22, 2020
e312fcb
[REFACTOR][IR] Migrate IRModule ObjectRef to not-null (#5654)
May 22, 2020
08bbbbf
Upgrade XGBoost to latest (#5658)
hcho3 May 23, 2020
6d6889a
Increase bss section size. (#5660)
areusch May 23, 2020
727f2bd
[PatternLang] Convert PatternGrouper to do pre-order, non-recursive a…
May 23, 2020
8ba1434
[Relay,Topi][OP] affine_grid and grid_sample (#5657)
vinx13 May 23, 2020
7c2f729
[TIR][BUILD] Remove buffer params from pass config. (#5652)
tqchen May 23, 2020
6e1c282
handle likely in IRMutatorWithAnalyzer (#5665)
roastduck May 25, 2020
8dd1981
[TOPI] Improve CUDA softmax scheduling (#5600)
wpan11nv May 25, 2020
2e6f759
[Relay][Op]Support symbolic TopK, Ones, Zeros and Full (#5459)
kevinthesun May 26, 2020
0645ddc
[PYTHON] Add buffer name when creating tensor bindings (#5670)
tqchen May 26, 2020
e1f8dc3
[REFACTOR][TIR][API-Change] Migrate BuildConfig to PassContext. (#5668)
tqchen May 26, 2020
0572864
[Doc] Misc doc fix (#5672)
FrozenGene May 26, 2020
5197650
[C++ RPC] Fix C++ RPC build problem on Linux (#5671)
FrozenGene May 26, 2020
67fa7f6
enable amd_apu device on vulkan target (#5659)
mei-ye May 26, 2020
88ca82e
[AutoTVM][TOPI] AutoTVM incorrect measurement (#5511)
cchung100m May 26, 2020
12725c6
[POC][PatternLang]Remove constants from partitioned functions (#5663)
May 26, 2020
e2cd522
[TF] Support TupleWrapper as direct ancestor of control flow ops (#5639)
lixiaoquan May 26, 2020
bd14bd7
add tvm.micro pydoc to sphinx (#5661)
areusch May 26, 2020
18e01de
add a check for null function attributes (#5674)
May 26, 2020
f538f3e
[BYOC] Pattern Language MergeComposite (#5656)
comaniac May 26, 2020
15651bc
add a testcase for #5674 (#5677)
May 27, 2020
6d0d74e
Call previous excepthook in tvm_excepthook. (#5675)
notoraptor May 27, 2020
115ee90
Fix the shift column for scale_shift_nchw and scale_shift_nhwc in C t…
tobegit3hub May 27, 2020
5d0f22d
[Bugfix] Fix Python debugger segfaults with TVM built with LLVM (#5685)
junrushao May 27, 2020
1eb0677
[DOC] Improve Pattern Language Docs (#5676)
comaniac May 28, 2020
a7ab91e
[TFLITE]Quantize & Dequantize op (#5394)
siju-samuel May 28, 2020
e03458b
[TIR][REFACTOR] std::string -> String Migration in TIR nodes (#5596)
cchung100m May 28, 2020
3389e00
[PatternLang] Add ConstantPattern (#5689)
comaniac May 28, 2020
e92e3ff
[PYTORCH]Minor bug fixes (#5683)
siju-samuel May 29, 2020
bbf6289
[Relay] Fix dataflow_pattern.rewrite() hang if Match in IR (#5680)
lixiaoquan May 29, 2020
8ba91d5
[RELAY] Fix segfault in pretty print when ObjectRef is null (#5681)
lhutton1 May 29, 2020
e75d9fe
[REFACTOR][RELAY] move fallback_device to config (#5690)
zhiics May 29, 2020
82903b8
@zhiics -> PPMC (#5692)
tqchen May 29, 2020
6a4bebc
[COMMUNITY] @masahi -> PPMC (#5691)
tqchen May 29, 2020
92f4fd1
Support more dtypes for TVMDSOOp (#5694)
tobegit3hub May 29, 2020
22761ab
[ONNX]LpPool Support added (#5696)
siju-samuel May 29, 2020
81990f7
In memory_plan, check if value is not None, instead of just checking …
notoraptor May 29, 2020
8151305
[PatternLang]Conditionally Embedding Constants in Partitioned Functio…
May 30, 2020
887da51
[ONNX] Skip ADD inside Gemm op when vector is zero (#5697)
cbalint13 May 30, 2020
db46cc8
[BYOC] Support Tuple Output in C/DNNL Codegen (#5701)
comaniac May 30, 2020
d06ed8e
[REFACTOR][RELAY] Replace build_config with PassContext (#5698)
zhiics May 30, 2020
6e74866
[PYTORCH]floor_divide support for squeezenet (#5702)
siju-samuel May 30, 2020
aca92e6
[AutoTVM][TOPI] Fix bifrost spatial packing conv2d auto tune (#5684)
cchung100m May 31, 2020
519c4ad
[Arith] ExtendedEuclidean merge impl to int_operator (#5625)
Jun 1, 2020
dc9cee0
fix typo: anchor windoes should be anchor windows (#5706)
randxie Jun 1, 2020
3c718f1
[REFACTOR][PY] relay.op.Op -> tvm.ir.Op (#5705)
tqchen Jun 1, 2020
aab95e8
[PatternLang] Simplify Pattern API Implementations (#5703)
comaniac Jun 2, 2020
f848dff
[PYTORCH]ReplicationPad support added (#5708)
siju-samuel Jun 2, 2020
6344d06
Remove deprecated opengl files (#5711)
tqchen Jun 2, 2020
4606693
Remove opengl runtime and cmake (#5712)
tqchen Jun 2, 2020
c8ab3f6
[BUGFIX][CRT] Fix Compilation Error in CRT (#5713)
liangfu Jun 2, 2020
c29f3b7
Rename tvm_dso_op to libtvm_dso_op (#5714)
tobegit3hub Jun 2, 2020
2b4b60e
[Object] Unify StrMapNode and MapNode (#5687)
junrushao Jun 2, 2020
a274dd4
[MXNET]Softmin, trunc op support added (#5715)
siju-samuel Jun 3, 2020
e587a13
Avoid downloading when TOPHUB_LOCATION is NONE (#5720)
lixiaoquan Jun 3, 2020
932e434
[Object][FFI] Introduce runtime::String::CanConvertFrom (#5718)
junrushao Jun 3, 2020
ccd455c
[Object] Restore the StrMap behavior in JSON/SHash/SEqual (#5719)
junrushao Jun 3, 2020
5049211
Fix generating types like float44 and float88 (#5722)
abergeron Jun 3, 2020
9b7a754
[ONNX]ReduceL1, ReduceL2, ReduceSumSquare, ReduceLogSum ops added (#5…
siju-samuel Jun 4, 2020
5aca76f
[TENSORFLOW]StatefulPartitionedCall/PartitionedCall Ops support added…
deepakbabel23 Jun 4, 2020
5733295
[AutoTVM, Relay] Clear compile engine after task extraction (#5724)
vinx13 Jun 4, 2020
e769544
Fix runtime::String backward compatibility in JSON (#5725)
junrushao Jun 4, 2020
ddad27f
codegen llvm: move nvptx-specific intrinsic handling into codegen_nvp…
t-vi Jun 4, 2020
6436080
[TOPI,RELAY][TFLITE] Sparse to dense operator (#5447)
dhruvaray Jun 4, 2020
fd4c26c
[Frontend][TFLite] Add parser support for shape and range (#5329)
dhruvaray Jun 4, 2020
bdede15
[REFACTOR] Separate ArgTypeCode from DLDataTypeCode (#5730)
tqchen Jun 4, 2020
9ecfc74
[ONNX]MaxRoiPool, Mod & Xor op support added (#5729)
siju-samuel Jun 5, 2020
ad1122a
ROCm: Add warp shuffles and enable reductions (#5727)
t-vi Jun 5, 2020
50d79ff
Change 'delete's in Relay VM Instruction dtor to 'delete[]'s (#5735)
akosik-anyvision Jun 5, 2020
6996fb6
Fix reshape usage in ARM Winograd (#5732)
comaniac Jun 5, 2020
0078773
[TEST] Fix flaky topi/tests/python/test_topi_pooling.py:test_adaptive…
tqchen Jun 5, 2020
4879749
Fix the values for test_fmod since it fails way too often otherwise (…
abergeron Jun 5, 2020
6c7a32b
fix small bug about dense_grad (#5695)
handar423 Jun 6, 2020
b6278bf
[REFACTOR][ARITH] Remove legacy compute_expr.h (#5738)
tqchen Jun 6, 2020
6790a19
Add some docs on downstream consistency (#5742)
junrushao Jun 6, 2020
2fab9c1
sequential cpp test (#5745)
zhiics Jun 7, 2020
4fefa1d
[REFACTOR][TE][TIR] Call::Halide => ProducerLoad, DSL/TIR decouple. (…
tqchen Jun 7, 2020
da3947c
Don't add cast for TF batch norm when type isn't changing (#5731)
Jun 8, 2020
b93338e
[ARITH][BACKPORT-0.6] fix a min/max simplify bug (#5749)
xqdan Jun 9, 2020
a6eba1a
[TOPI][Relay][OP] support dynamic NMS(Non Maximum Suppression), symbo…
yongwww Jun 9, 2020
e4e1914
Update dmlc_tvm_commit_id.txt
Jun 9, 2020
349819f
Update TRT Integration to reflect upstream changes
Jun 9, 2020
bdf03dd
Sync submodules
Jun 9, 2020
c20e34c
Fix jenkinsfile
Jun 9, 2020
95144ee
git-clang-format against origin/dev instead of origin/master
Jun 9, 2020
bcfa727
Fix formatting.
Jun 9, 2020
2238600
Remove is_empty in export_lib (used for old trt)
Jun 10, 2020
719c374
Disable test_forward_qnn_mobilenet_v2_net
Jun 11, 2020
8f6bf2a
Add Scatter to Topi/Relay/ONNX via hybrid script (#5619)
Jun 9, 2020
b6bbed0
[Minor][Test] Clean WASM environment before build (#5759)
junrushao Jun 10, 2020
eaaefd4
[Bugfix] Fix reshape (#5739)
comaniac Jun 10, 2020
f4e8cf1
[REFACTOR][TIR] Provide->ProducerStore, Realize->ProducerRealize. (#5…
tqchen Jun 10, 2020
f12956b
[Rust] Second stage of Rust Refactor (#5527)
jroesch Jun 10, 2020
e7a3b38
[topi] block sparse dense on cuda (#5746)
ceruleangu Jun 10, 2020
827e103
[Relay] Fix for recursive let (#5757)
icemelon Jun 11, 2020
53dcdb0
[TOPI][RELAY][PYTORCH]Conv3d_transpose op support added (#5737)
siju-samuel Jun 11, 2020
14157dd
Fix gelu in PyTorch frontend, tighten numerical checks (#5763)
t-vi Jun 11, 2020
56dfec2
Add ShapePattern and DataTypePattern (#5760)
Jun 11, 2020
629ad8d
Make batch matrix multiplication on GPU tunable (#5752)
t-vi Jun 11, 2020
f0b6eae
[TIR][REFACTOR][API-Change] Migrate the tvm/tir/expr.h to construct s…
tqchen Jun 11, 2020
1309af6
[TIR][REFACTOR][API-Change] Migrate tir/stmt.h to use constructor. (#…
tqchen Jun 11, 2020
0e0a69c
[Frontend][TensorFlow] Improve Control Flow and TensorArray (#5699)
kevinthesun Jun 12, 2020
f0dc96b
[DOC][FIX] Fix some typos in git-clang-format.sh (#5786)
zhiics Jun 12, 2020
f6d8444
fix #5686: remove a overstrict assert in MakeAllreduce (#5686) (#5785)
majiang31312 Jun 12, 2020
5c39c30
[RUNTIME] Add compile_shared option to linux compile utility fn (#5751)
wrongtest-intellif Jun 12, 2020
b2e172c
[REFACTOR][API-Change] Migrate all Object construction to constructor…
tqchen Jun 12, 2020
79d6d65
[Topi] pass-by-value -> pass-by-const-reference (#5783)
hlu1 Jun 12, 2020
9a38788
[topi][relay] Add operation gather to relay. (#5716)
notoraptor Jun 12, 2020
9895e24
[CODEGEN][CONTRIB] CoreML codegen (#5634)
kazum Jun 12, 2020
e86a7a7
fix calibration pass to support multiple functions (#5768)
seanlatias Jun 12, 2020
63c2e30
[cmake] update vulkan rules (#5777)
antinucleon Jun 12, 2020
71be16c
Add ignore storage_order attribute to onnx pooling parser. (#5781)
jwfromm Jun 12, 2020
0ec274f
[BYOC][FIX] Infer types in MergeComposite (#5766)
mbaret Jun 12, 2020
826816c
[FRONTEND]Darknet support batch size for yolo (#5688)
siju-samuel Jun 12, 2020
ca3cb75
Update dmlc_tvm_commid_id.txt
Jun 12, 2020
f627892
Skip tflite test_forward_mediapipe_hand_landmark
Jun 12, 2020
2283275
Increase stack limit for failing tflite tests. Skip TF tests which re…
Jun 15, 2020
6f63123
[PYTORCH]aten::norm support added (#5776)
siju-samuel Jun 12, 2020
79721f8
[TENSORFLOW]Conv3d Transpose OP added (#5775)
siju-samuel Jun 12, 2020
15709c2
[TF] Support symbolic inputs of Fill (#5762)
lixiaoquan Jun 12, 2020
5522ad6
[COMMUNITY] @wpan11nv -> Reviewer (#5790)
tqchen Jun 12, 2020
534eccf
Edit onnx parser to infer values in post order (#5755)
Jun 12, 2020
ae745ea
[TIR][REFACTOR] Cleanup unused classes (#5789)
tqchen Jun 13, 2020
a9aa8ac
Fix tf parser (#5794)
kevinthesun Jun 13, 2020
e21351c
support aten::type_as in the pytorch frontend (#5787)
randxie Jun 13, 2020
9eb29b6
[TIR][REFACTIR] Update TIR nodes std::string->String. (#5793)
tqchen Jun 13, 2020
ca14048
[TEST] Temporary disable fp16 type_as test for PyTorch Frontend (#5799)
tqchen Jun 13, 2020
29e2ec7
[ONNX] Skip multiply with 1.0f constant for GEMM import (#5800)
cbalint13 Jun 14, 2020
34a581f
[TIR][REFACTOR] Add tir prefix to type keys (#5802)
tqchen Jun 14, 2020
f250700
[QUANTIZE] Add config switch for nn.dense layer type. (#5801)
cbalint13 Jun 14, 2020
33fcf79
[topi] fix sparse dense schedule on cuda (#5803)
ceruleangu Jun 14, 2020
8e18755
Allow RPCWrappedFunc to rewrite runtime::String as std::string (#5796)
junrushao Jun 14, 2020
2ca5680
[topi] fix strategy for sparse dense cuda (#5782)
antinucleon Jun 15, 2020
eecc5d2
[CI] Move cpu-only frontend tests to a CPU stage (#5807)
tqchen Jun 15, 2020
89160b9
[MXNET]conv3d and conv3d_transpose addedx (#5814)
siju-samuel Jun 15, 2020
ffb4004
Pin hand landmark network to version 0.7.4. (#5813)
leandron Jun 15, 2020
bc5a78d
[CI] Limit number of threads in all jobs (#5815)
tqchen Jun 15, 2020
13290ab
Update dmlc_tvm_commit_id.txt
Jun 15, 2020
dbb760c
Disable tensorflow.test_forward_sdd because stack limit of 100mb is e…
Jun 15, 2020
7 changes: 4 additions & 3 deletions .gitignore
@@ -2,10 +2,11 @@
__pycache__/
*.py[cod]
*$py.class

*.S
# C extensions
*.so

*.ll
.npm
# Distribution / packaging
.Python
env/
@@ -224,7 +225,7 @@ Pipfile.lock
# conda package artifacts
conda/Dockerfile.cuda*
conda/pkg

.node_repl_history
# nix files
.envrc
*.nix
4 changes: 2 additions & 2 deletions 3rdparty/bfloat16/bfloat16.cc
@@ -17,6 +17,7 @@
==============================================================================*/

#include <tvm/runtime/c_runtime_api.h>

#include <cstddef>
#include <cstdint>

@@ -50,8 +51,7 @@ void BFloat16ToFloat(const uint16_t* src, float* dst, size_t size) {
#endif
}

void BFloat16Add(const uint16_t* a, const uint16_t* b, uint16_t* dst,
size_t size) {
void BFloat16Add(const uint16_t* a, const uint16_t* b, uint16_t* dst, size_t size) {
float a_f, b_f;
BFloat16ToFloat(a, &a_f, 1);
BFloat16ToFloat(b, &b_f, 1);
17 changes: 7 additions & 10 deletions 3rdparty/cma/cma.h
@@ -27,20 +27,17 @@
#ifndef VTA_DE10_NANO_KERNEL_MODULE_CMA_H_
#define VTA_DE10_NANO_KERNEL_MODULE_CMA_H_


/* Should be defined in settings.mk file */
#ifndef CMA_IOCTL_MAGIC
#define CMA_IOCTL_MAGIC 0xf2
#define CMA_IOCTL_MAGIC 0xf2
#endif

#define CMA_ALLOC_CACHED _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 1, 4)
#define CMA_ALLOC_NONCACHED _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 2, 4)
#define CMA_FREE _IOC(_IOC_WRITE, CMA_IOCTL_MAGIC, 3, 4)
#define CMA_GET_PHY_ADDR _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 4, 4)
#define CMA_GET_SIZE _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 5, 4)

#define CMA_ALLOC_CACHED _IOC(_IOC_WRITE|_IOC_READ, CMA_IOCTL_MAGIC, 1, 4)
#define CMA_ALLOC_NONCACHED _IOC(_IOC_WRITE|_IOC_READ, CMA_IOCTL_MAGIC, 2, 4)
#define CMA_FREE _IOC(_IOC_WRITE, CMA_IOCTL_MAGIC, 3, 4)
#define CMA_GET_PHY_ADDR _IOC(_IOC_WRITE|_IOC_READ, CMA_IOCTL_MAGIC, 4, 4)
#define CMA_GET_SIZE _IOC(_IOC_WRITE|_IOC_READ, CMA_IOCTL_MAGIC, 5, 4)

#define CMA_IOCTL_MAXNR 5

#define CMA_IOCTL_MAXNR 5

#endif // VTA_DE10_NANO_KERNEL_MODULE_CMA_H_
62 changes: 28 additions & 34 deletions 3rdparty/cma/cma_api_impl.h
@@ -30,48 +30,47 @@
* \brief Application layer implementation for contigous memory allocation.
*/

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/types.h>
#include <unistd.h>

#include "cma_api.h"

#ifndef CMA_IOCTL_MAGIC
#define CMA_IOCTL_MAGIC 0xf2
#define CMA_IOCTL_MAGIC 0xf2
#endif

#define CMA_ALLOC_CACHED _IOC(_IOC_WRITE|_IOC_READ, CMA_IOCTL_MAGIC, 1, 4)
#define CMA_ALLOC_NONCACHED _IOC(_IOC_WRITE|_IOC_READ, CMA_IOCTL_MAGIC, 2, 4)
#define CMA_FREE _IOC(_IOC_WRITE, CMA_IOCTL_MAGIC, 3, 4)
#define CMA_GET_PHY_ADDR _IOC(_IOC_WRITE|_IOC_READ, CMA_IOCTL_MAGIC, 4, 4)
#define CMA_GET_SIZE _IOC(_IOC_WRITE|_IOC_READ, CMA_IOCTL_MAGIC, 5, 4)
#define CMA_ALLOC_CACHED _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 1, 4)
#define CMA_ALLOC_NONCACHED _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 2, 4)
#define CMA_FREE _IOC(_IOC_WRITE, CMA_IOCTL_MAGIC, 3, 4)
#define CMA_GET_PHY_ADDR _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 4, 4)
#define CMA_GET_SIZE _IOC(_IOC_WRITE | _IOC_READ, CMA_IOCTL_MAGIC, 5, 4)

#define CMA_IOCTL_MAXNR 5
#define CMA_IOCTL_MAXNR 5

#ifndef CMA_DEBUG
#define CMA_DEBUG 0
#define CMA_DEBUG 0
#endif
#ifndef DRIVER_NODE_NAME
#define DRIVER_NODE_NAME "cma"
#define DRIVER_NODE_NAME "cma"
#endif

#if CMA_DEBUG == 1
#define __DEBUG(fmt, args...) printf("CMA_API_DEBUG: " fmt, ##args)
#define __DEBUG(fmt, args...) printf("CMA_API_DEBUG: " fmt, ##args)
#else
#define __DEBUG(fmt, args...)
#define __DEBUG(fmt, args...)
#endif

#define ROUND_UP(N, S) ((((N) + (S) - 1) / (S)) * (S))

#define ROUND_UP(N, S) ((((N) + (S)-1) / (S)) * (S))

/* Private functions */
void *cma_alloc(size_t size, unsigned ioctl_cmd);
void* cma_alloc(size_t size, unsigned ioctl_cmd);

/* Global file descriptor */
int cma_fd = 0;
@@ -99,23 +98,19 @@ int cma_release(void) {
return 0;
}

void *cma_alloc_cached(size_t size) {
return cma_alloc(size, CMA_ALLOC_CACHED);
}
void* cma_alloc_cached(size_t size) { return cma_alloc(size, CMA_ALLOC_CACHED); }

void *cma_alloc_noncached(size_t size) {
return cma_alloc(size, CMA_ALLOC_NONCACHED);
}
void* cma_alloc_noncached(size_t size) { return cma_alloc(size, CMA_ALLOC_NONCACHED); }

int cma_free(void *mem) {
int cma_free(void* mem) {
__DEBUG("Releasing contigous memory from 0x%x\n", (unsigned)mem);
unsigned data, v_addr;

/* save user space pointer value */
data = (unsigned)mem;
data = (unsigned)mem;
v_addr = (unsigned)mem;

if ( ioctl(cma_fd, CMA_GET_SIZE, &data) == -1 ) {
if (ioctl(cma_fd, CMA_GET_SIZE, &data) == -1) {
__DEBUG("cma_free - ioctl command unsuccsessful - 0\n");
return -1;
}
@@ -125,23 +120,23 @@ int cma_free(void *mem) {
munmap(mem, data);

/* free cma entry */
if ( ioctl(cma_fd, CMA_FREE, &v_addr) == -1 ) {
if (ioctl(cma_fd, CMA_FREE, &v_addr) == -1) {
__DEBUG("cma_free - ioctl command unsuccsessful - 1\n");
return -1;
}

return 0;
}

unsigned cma_get_phy_addr(void *mem) {
unsigned cma_get_phy_addr(void* mem) {
unsigned data;
__DEBUG("Getting physical address from 0x%x\n", (unsigned)mem);

/* save user space pointer value */
data = (unsigned)mem;

/* get physical address */
if ( ioctl(cma_fd, CMA_GET_PHY_ADDR, &data) == -1 ) {
if (ioctl(cma_fd, CMA_GET_PHY_ADDR, &data) == -1) {
__DEBUG("cma_free - ioctl command unsuccsessful\n");
return 0;
}
@@ -150,18 +145,17 @@ unsigned cma_get_phy_addr(void *mem) {
return data;
}


void *cma_alloc(size_t size, unsigned ioctl_cmd) {
void* cma_alloc(size_t size, unsigned ioctl_cmd) {
unsigned data;
void *mem;
void* mem;
__DEBUG("Allocating 0x%x bytes of contigous memory\n", size);

/* Page align size */
size = ROUND_UP(size, getpagesize());

/* ioctl cmd to allocate contigous memory */
data = (unsigned)size;
if ( ioctl(cma_fd, ioctl_cmd, &data) == -1 ) {
if (ioctl(cma_fd, ioctl_cmd, &data) == -1) {
__DEBUG("cma_alloc - ioctl command unsuccsessful\n");
return NULL;
}
82 changes: 52 additions & 30 deletions 3rdparty/compiler-rt/builtin_fp16.h
@@ -29,16 +29,33 @@ static inline uint32_t __clz(uint32_t x) {
int n = 32;
uint32_t y;

y = x >>16; if (y) { n = n -16; x = y; }
y = x >> 8; if (y) { n = n - 8; x = y; }
y = x >> 4; if (y) { n = n - 4; x = y; }
y = x >> 2; if (y) { n = n - 2; x = y; }
y = x >> 1; if (y) return n - 2;
y = x >> 16;
if (y) {
n = n - 16;
x = y;
}
y = x >> 8;
if (y) {
n = n - 8;
x = y;
}
y = x >> 4;
if (y) {
n = n - 4;
x = y;
}
y = x >> 2;
if (y) {
n = n - 2;
x = y;
}
y = x >> 1;
if (y) return n - 2;
return n - x;
}

template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS, typename DST_T, typename DST_REP_T,
int DST_SIG_BITS>
static inline DST_T __truncXfYf2__(SRC_T a) {
// Various constants whose values follow from the type parameters.
// Any reasonable optimizer will fold and propagate all of these.
@@ -71,7 +88,10 @@ static inline DST_T __truncXfYf2__(SRC_T a) {
const DST_REP_T dstNaNCode = dstQNaN - 1;

// Break a into a sign and representation of the absolute value
union SrcExchangeType { SRC_T f; SRC_REP_T i; };
union SrcExchangeType {
SRC_T f;
SRC_REP_T i;
};
SrcExchangeType src_rep;
src_rep.f = a;
const SRC_REP_T aRep = src_rep.i;

const SRC_REP_T roundBits = aAbs & roundMask;
// Round to nearest
if (roundBits > halfway)
absResult++;
// Ties to even
if (roundBits > halfway) absResult++;
// Ties to even
else if (roundBits == halfway)
absResult += absResult & 1;
}
else if (aAbs > srcInfinity) {
} else if (aAbs > srcInfinity) {
// a is NaN.
// Conjure the result by beginning with infinity, setting the qNaN
// bit and inserting the (truncated) trailing NaN field.
absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
absResult |= dstQNaN;
absResult |= ((aAbs & srcNaNCode) >> (SRC_SIG_BITS - DST_SIG_BITS)) & dstNaNCode;
}
else if (aAbs >= overflow) {
} else if (aAbs >= overflow) {
// a overflows to infinity.
absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
}
else {
} else {
// a underflows on conversion to the destination type or is an exact
// zero. The result may be a denormal or zero. Extract the exponent
// to get the shift amount for the denormalization.
@@ -124,24 +140,26 @@ static inline DST_T __truncXfYf2__(SRC_T a) {
absResult = denormalizedSignificand >> (SRC_SIG_BITS - DST_SIG_BITS);
const SRC_REP_T roundBits = denormalizedSignificand & roundMask;
// Round to nearest
if (roundBits > halfway)
absResult++;
// Ties to even
if (roundBits > halfway) absResult++;
// Ties to even
else if (roundBits == halfway)
absResult += absResult & 1;
}
}

// Apply the signbit to (DST_T)abs(a).
const DST_REP_T result = absResult | sign >> (srcBits - dstBits);
union DstExchangeType { DST_T f; DST_REP_T i; };
union DstExchangeType {
DST_T f;
DST_REP_T i;
};
DstExchangeType dst_rep;
dst_rep.i = result;
return dst_rep.f;
}

template<typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS,
typename DST_T, typename DST_REP_T, int DST_SIG_BITS>
template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS, typename DST_T, typename DST_REP_T,
int DST_SIG_BITS>
static inline DST_T __extendXfYf2__(SRC_T a) {
// Various constants whose values follow from the type parameters.
// Any reasonable optimizer will fold and propagate all of these.
@@ -157,15 +175,18 @@ static inline DST_T __extendXfYf2__(SRC_T a) {
const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
const SRC_REP_T srcNaNCode = srcQNaN - 1;

const int dstBits = sizeof(DST_T)*8;
const int dstBits = sizeof(DST_T) * 8;
const int dstExpBits = dstBits - DST_SIG_BITS - 1;
const int dstInfExp = (1 << dstExpBits) - 1;
const int dstExpBias = dstInfExp >> 1;

const DST_REP_T dstMinNormal = DST_REP_T(1) << DST_SIG_BITS;

// Break a into a sign and representation of the absolute value
union SrcExchangeType { SRC_T f; SRC_REP_T i; };
union SrcExchangeType {
SRC_T f;
SRC_REP_T i;
};
SrcExchangeType src_rep;
src_rep.f = a;
const SRC_REP_T aRep = src_rep.i;
Expand All @@ -191,8 +212,7 @@ static inline DST_T __extendXfYf2__(SRC_T a) {
absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
absResult |= (DST_REP_T)(aAbs & srcQNaN) << (DST_SIG_BITS - SRC_SIG_BITS);
absResult |= (DST_REP_T)(aAbs & srcNaNCode) << (DST_SIG_BITS - SRC_SIG_BITS);
}
else if (aAbs) {
} else if (aAbs) {
// a is denormal.
// renormalize the significand and clear the leading bit, then insert
// the correct adjusted exponent in the destination type.
@@ -201,15 +221,17 @@ static inline DST_T __extendXfYf2__(SRC_T a) {
absResult ^= dstMinNormal;
const int resultExponent = dstExpBias - srcExpBias - scale + 1;
absResult |= (DST_REP_T)resultExponent << DST_SIG_BITS;
}
else {
} else {
// a is zero.
absResult = 0;
}

// Apply the signbit to (DST_T)abs(a).
const DST_REP_T result = absResult | (DST_REP_T)sign << (dstBits - srcBits);
union DstExchangeType { DST_T f; DST_REP_T i; };
union DstExchangeType {
DST_T f;
DST_REP_T i;
};
DstExchangeType dst_rep;
dst_rep.i = result;
return dst_rep.f;