From 7a6a9e0f275225d4e99d7007fd876e37c45a3577 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Thu, 23 Jul 2015 01:44:52 -0400
Subject: [PATCH 01/11] WIP: start implementation of generic versions of the
 julia intrinsic functions

---
 base/pointer.jl          |   4 +-
 src/APInt-C.cpp          | 486 ++++++++++++++++++++
 src/APInt-C.h            |  78 ++++
 src/Makefile             |   2 +-
 src/alloc.c              |  40 --
 src/codegen.cpp          |   1 +
 src/intrinsics.cpp       | 231 +++++++---
 src/julia_internal.h     |  92 ++++
 src/runtime_intrinsics.c | 930 +++++++++++++++++++++++++++++++++++++++
 9 files changed, 1764 insertions(+), 100 deletions(-)
 create mode 100644 src/APInt-C.cpp
 create mode 100644 src/APInt-C.h
 create mode 100644 src/runtime_intrinsics.c

diff --git a/base/pointer.jl b/base/pointer.jl
index 1956ef2263b66..c532b3e3053e3 100644
--- a/base/pointer.jl
+++ b/base/pointer.jl
@@ -5,7 +5,7 @@
 const C_NULL = box(Ptr{Void}, 0)
 
 # pointer to integer
-convert{T<:Union{Int,UInt}}(::Type{T}, x::Ptr) = box(T, unbox(Ptr,x))
+convert{T<:Union{Int,UInt}}(::Type{T}, x::Ptr) = box(T, unbox(Ptr{Void},x))
 convert{T<:Integer}(::Type{T}, x::Ptr) = convert(T,convert(UInt, x))
 
 # integer to pointer
@@ -14,7 +14,7 @@ convert{T}(::Type{Ptr{T}}, x::Int) = box(Ptr{T},unbox(Int,Int(x)))
 
 # pointer to pointer
 convert{T}(::Type{Ptr{T}}, p::Ptr{T}) = p
-convert{T}(::Type{Ptr{T}}, p::Ptr) = box(Ptr{T}, unbox(Ptr,p))
+convert{T}(::Type{Ptr{T}}, p::Ptr) = box(Ptr{T}, unbox(Ptr{Void},p))
 
 # object to pointer (when used with ccall)
 unsafe_convert(::Type{Ptr{UInt8}}, x::Symbol) = ccall(:jl_symbol_name, Ptr{UInt8}, (Any,), x)
diff --git a/src/APInt-C.cpp b/src/APInt-C.cpp
new file mode 100644
index 0000000000000..16a4c3059493e
--- /dev/null
+++ b/src/APInt-C.cpp
@@ -0,0 +1,486 @@
+#include "llvm-version.h"
+#include <llvm/ADT/APInt.h>
+#include <llvm/ADT/APFloat.h>
+#include <llvm/Support/MathExtras.h>
+
+#define DLLEXPORT
+extern "C" DLLEXPORT void jl_error(const char *str);
+
+using namespace llvm;
+
+/* create "APInt s" (numbits wide) from the caller-supplied buffer "integerPart *ps" */
+#define CREATE(s) \
+    APInt s; \
+    if ((numbits % integerPartWidth) != 0) { \
+        /* use RoundUpToAlignment to round the memory area up to the nearest integerPart-sized chunk */ \
+        unsigned nbytes = RoundUpToAlignment(numbits, integerPartWidth) / host_char_bit; \
+        integerPart *data_a64 = (integerPart*)alloca(nbytes); \
+        /* TODO: this memcpy assumes little-endian,
+         * for big-endian, need to align the copy to the other end */ \
+        memcpy(data_a64, p##s, RoundUpToAlignment(numbits, host_char_bit) / host_char_bit); \
+        s = APInt(numbits, makeArrayRef(data_a64, nbytes / sizeof(integerPart))); \
+    } \
+    else { \
+        s = APInt(numbits, makeArrayRef(p##s, numbits / integerPartWidth)); \
+    }
+
+/* assign to "integerPart *pr" from "APInt a": one 8/16/32/64-bit store when numbits <= 64, else a byte-wise memcpy. NOTE(review): the store width is rounded up (e.g. 4 bytes for numbits == 24) -- confirm the destination is that large */
+#define ASSIGN(r, a) \
+    if (numbits <= 8) \
+        *(uint8_t*)p##r = a.getZExtValue(); \
+    else if (numbits <= 16) \
+        *(uint16_t*)p##r = a.getZExtValue(); \
+    else if (numbits <= 32) \
+        *(uint32_t*)p##r = a.getZExtValue(); \
+    else if (numbits <= 64) \
+        *(uint64_t*)p##r = a.getZExtValue(); \
+    else \
+        memcpy(p##r, a.getRawData(), RoundUpToAlignment(numbits, host_char_bit) / host_char_bit); \
+
+extern "C" DLLEXPORT
+void LLVMNeg(unsigned numbits, integerPart *pa, integerPart *pr) { // *pr = 0 - *pa, evaluated at numbits width
+    APInt z(numbits, 0);
+    CREATE(a)
+    z -= a; // negate by subtracting from a zero of the same width
+    ASSIGN(r, z)
+}
+
+extern "C" DLLEXPORT
+void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = *pa + *pb at numbits width
+    CREATE(a)
+    CREATE(b)
+    a += b;
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = *pa - *pb at numbits width
+    CREATE(a)
+    CREATE(b)
+    a -= b;
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = *pa * *pb at numbits width
+    CREATE(a)
+    CREATE(b)
+    a *= b;
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = APInt::sdiv(*pa, *pb); no zero-divisor check is done here
+    CREATE(a)
+    CREATE(b)
+    a = a.sdiv(b);
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = APInt::udiv(*pa, *pb); no zero-divisor check is done here
+    CREATE(a)
+    CREATE(b)
+    a = a.udiv(b);
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = APInt::srem(*pa, *pb); no zero-divisor check is done here
+    CREATE(a)
+    CREATE(b)
+    a = a.srem(b);
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = APInt::urem(*pa, *pb); no zero-divisor check is done here
+    CREATE(a)
+    CREATE(b)
+    a = a.urem(b);
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+int LLVMICmpEQ(unsigned numbits, integerPart *pa, integerPart *pb) { // returns the result of APInt::eq on the two numbits-wide operands
+    CREATE(a)
+    CREATE(b)
+    return a.eq(b);
+}
+
+extern "C" DLLEXPORT
+int LLVMICmpNE(unsigned numbits, integerPart *pa, integerPart *pb) { // returns the result of APInt::ne on the two numbits-wide operands
+    CREATE(a)
+    CREATE(b)
+    return a.ne(b);
+}
+
+extern "C" DLLEXPORT
+int LLVMICmpSLT(unsigned numbits, integerPart *pa, integerPart *pb) { // returns APInt::slt(*pa, *pb), i.e. the signed "less than" comparison
+    CREATE(a)
+    CREATE(b)
+    return a.slt(b);
+}
+
+extern "C" DLLEXPORT
+int LLVMICmpULT(unsigned numbits, integerPart *pa, integerPart *pb) { // returns APInt::ult(*pa, *pb), i.e. the unsigned "less than" comparison
+    CREATE(a)
+    CREATE(b)
+    return a.ult(b);
+}
+
+extern "C" DLLEXPORT
+int LLVMICmpSLE(unsigned numbits, integerPart *pa, integerPart *pb) { // returns APInt::sle(*pa, *pb), i.e. the signed "less or equal" comparison
+    CREATE(a)
+    CREATE(b)
+    return a.sle(b);
+}
+
+extern "C" DLLEXPORT
+int LLVMICmpULE(unsigned numbits, integerPart *pa, integerPart *pb) { // returns APInt::ule(*pa, *pb), i.e. the unsigned "less or equal" comparison
+    CREATE(a)
+    CREATE(b)
+    return a.ule(b);
+}
+
+extern "C" DLLEXPORT
+void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = *pa & *pb (bitwise AND)
+    CREATE(a)
+    CREATE(b)
+    a &= b;
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = *pa | *pb (bitwise OR)
+    CREATE(a)
+    CREATE(b)
+    a |= b;
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = *pa ^ *pb (bitwise XOR)
+    CREATE(a)
+    CREATE(b)
+    a ^= b;
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = APInt::shl(*pa, *pb); the shift amount is itself a numbits-wide value
+    CREATE(a)
+    CREATE(b)
+    a = a.shl(b);
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = APInt::lshr(*pa, *pb), the logical right shift
+    CREATE(a)
+    CREATE(b)
+    a = a.lshr(b);
+    ASSIGN(r, a)
+}
+extern "C" DLLEXPORT
+void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = APInt::ashr(*pa, *pb), the arithmetic right shift
+    CREATE(a)
+    CREATE(b)
+    a = a.ashr(b);
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+void LLVMFlipAllBits(unsigned numbits, integerPart *pa, integerPart *pr) { // *pr = *pa with every one of its numbits bits inverted
+    CREATE(a)
+    a.flipAllBits();
+    ASSIGN(r, a)
+}
+
+extern "C" DLLEXPORT
+int LLVMAdd_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // unsigned add; stores the result in *pr and returns the overflow flag
+    CREATE(a)
+    CREATE(b)
+    bool Overflow;
+    a = a.uadd_ov(b, Overflow);
+    ASSIGN(r, a)
+    return Overflow;
+}
+
+extern "C" DLLEXPORT
+int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // signed add; stores the result in *pr and returns the overflow flag
+    CREATE(a)
+    CREATE(b)
+    bool Overflow;
+    a = a.sadd_ov(b, Overflow);
+    ASSIGN(r, a)
+    return Overflow;
+}
+
+extern "C" DLLEXPORT
+int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // unsigned subtract; stores the result in *pr and returns the overflow flag
+    CREATE(a)
+    CREATE(b)
+    bool Overflow;
+    a = a.usub_ov(b, Overflow);
+    ASSIGN(r, a)
+    return Overflow;
+}
+
+extern "C" DLLEXPORT
+int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // signed subtract; stores the result in *pr and returns the overflow flag
+    CREATE(a)
+    CREATE(b)
+    bool Overflow;
+    a = a.ssub_ov(b, Overflow);
+    ASSIGN(r, a)
+    return Overflow;
+}
+
+extern "C" DLLEXPORT
+int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // signed multiply; stores the result in *pr and returns the overflow flag
+    CREATE(a)
+    CREATE(b)
+    bool Overflow;
+    a = a.smul_ov(b, Overflow);
+    ASSIGN(r, a)
+    return Overflow;
+}
+
+extern "C" DLLEXPORT
+int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // unsigned multiply; stores the result in *pr and returns the overflow flag
+    CREATE(a)
+    CREATE(b)
+    bool Overflow;
+    a = a.umul_ov(b, Overflow);
+    ASSIGN(r, a)
+    return Overflow;
+}
+
+extern "C" DLLEXPORT
+void LLVMByteSwap(unsigned numbits, integerPart *pa, integerPart *pr) { // *pr = APInt::byteSwap(*pa)
+    CREATE(a)
+    a = a.byteSwap();
+    ASSIGN(r, a)
+}
+
+void LLVMFPtoInt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr, bool isSigned, bool *isExact) { // shared helper: float/double at pa -> onumbits-wide integer at pr; isExact may be NULL
+    double Val;
+    if (numbits == 32)
+        Val = *(float*)pa;
+    else if (numbits == 64)
+        Val = *(double*)pa;
+    else
+        jl_error("FPtoSI: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); // NOTE(review): Val stays uninitialized if jl_error returns -- presumably it does not; message says FPtoSI on the unsigned path too
+    unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
+    if (onumbits <= 64) { // fast-path, if possible
+        if (isSigned) {
+            int64_t ia = Val; // NOTE(review): no range check before this conversion -- confirm callers handle out-of-range inputs
+            memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian
+            if (isExact) {
+                // check whether the conversion was lossless
+                int64_t ia2 = ia < 0 ? -1 : 0; // pre-fill with the sign so the bytes not copied below act as sign extension
+                memcpy(&ia2, pr, onumbytes);
+                *isExact = (Val == (double)ia2 && ia == ia2);
+            }
+        }
+        else {
+            uint64_t ia = Val; // NOTE(review): no range check before this conversion -- confirm callers handle out-of-range inputs
+            memcpy(pr, &ia, onumbytes); // TODO: assumes little-endian
+            if (isExact) {
+                // check whether the conversion was lossless
+                uint64_t ia2 = 0; // pre-fill with zero so the bytes not copied below act as zero extension
+                memcpy(&ia2, pr, onumbytes);
+                *isExact = (Val == (double)ia2 && ia == ia2);
+            }
+        }
+    }
+    else { // wider than 64 bits: let APFloat produce the integer parts
+        APFloat a(Val);
+        bool isVeryExact; // required out-parameter of convertToInteger; its value is not used afterwards
+        APFloat::roundingMode rounding_mode = APFloat::rmNearestTiesToEven;
+        unsigned nbytes = RoundUpToAlignment(onumbits, integerPartWidth) / host_char_bit;
+        integerPart *parts = (integerPart*)alloca(nbytes);
+        APFloat::opStatus status = a.convertToInteger(parts, onumbits, isSigned, rounding_mode, &isVeryExact);
+        memcpy(pr, parts, onumbytes);
+        if (isExact)
+            *isExact = (status == APFloat::opOK);
+    }
+}
+
+extern "C" DLLEXPORT
+void LLVMFPtoSI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { // signed conversion via LLVMFPtoInt, without the exactness report
+    LLVMFPtoInt(numbits, pa, onumbits, pr, true, NULL);
+}
+
+extern "C" DLLEXPORT
+void LLVMFPtoUI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { // unsigned conversion via LLVMFPtoInt, without the exactness report
+    LLVMFPtoInt(numbits, pa, onumbits, pr, false, NULL);
+}
+
+extern "C" DLLEXPORT
+int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { // signed conversion; returns the exactness flag computed by LLVMFPtoInt
+    bool isExact;
+    LLVMFPtoInt(numbits, pa, onumbits, pr, true, &isExact);
+    return isExact;
+}
+
+extern "C" DLLEXPORT
+int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { // unsigned conversion; returns the exactness flag computed by LLVMFPtoInt
+    bool isExact;
+    LLVMFPtoInt(numbits, pa, onumbits, pr, false, &isExact);
+    return isExact;
+}
+
+extern "C" DLLEXPORT
+void LLVMSItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { // signed numbits-wide integer at pa -> float (onumbits == 32) or double (onumbits == 64) at pr
+    CREATE(a)
+    double val = a.roundToDouble(true); // always goes through double first, even when the output is a float
+    if (onumbits == 32)
+        *(float*)pr = val;
+    else if (onumbits == 64)
+        *(double*)pr = val;
+    else
+        jl_error("SItoFP: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
+}
+
+extern "C" DLLEXPORT
+void LLVMUItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr) { // unsigned numbits-wide integer at pa -> float (onumbits == 32) or double (onumbits == 64) at pr
+    CREATE(a)
+    double val = a.roundToDouble(false); // always goes through double first, even when the output is a float
+    if (onumbits == 32)
+        *(float*)pr = val;
+    else if (onumbits == 64)
+        *(double*)pr = val;
+    else
+        jl_error("UItoFP: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
+}
+
+extern "C" DLLEXPORT
+void LLVMSExt(unsigned inumbits, integerPart *pa, unsigned onumbits, integerPart *pr) { // sign-extend the inumbits-wide value at pa into the onumbits-wide buffer at pr
+    assert(inumbits < onumbits);
+    unsigned inumbytes = RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit;
+    unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
+    int bits = (0 - inumbits) % host_char_bit; // number of padding bits in the top input byte
+    int signbit = (inumbits - 1) % host_char_bit; // position of the sign bit inside the top input byte
+    int sign = ((unsigned char*)pa)[inumbytes - 1] & (1 << signbit) ? -1 : 0;
+    // copy over the input bytes
+    memcpy(pr, pa, inumbytes);
+    if (bits) {
+        // sign-extend the partial byte; narrow back to 8 bits before shifting right, since the shifts run in promoted int and would otherwise cancel out
+        ((signed char*)pr)[inumbytes - 1] = (signed char)(((unsigned char*)pa)[inumbytes - 1] << bits) >> bits;
+    }
+    // sign-extend the rest of the bytes
+    memset((char*)pr + inumbytes, sign, onumbytes - inumbytes);
+}
+
+extern "C" DLLEXPORT
+void LLVMZExt(unsigned inumbits, integerPart *pa, unsigned onumbits, integerPart *pr) { // zero-extend the inumbits-wide value at pa into the onumbits-wide buffer at pr
+    assert(inumbits < onumbits);
+    unsigned inumbytes = RoundUpToAlignment(inumbits, host_char_bit) / host_char_bit;
+    unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
+    int bits = (0 - inumbits) % host_char_bit; // number of padding bits in the top input byte
+    // copy over the input bytes
+    memcpy(pr, pa, inumbytes);
+    if (bits) {
+        // zero the remaining bits of the partial byte; narrow back to 8 bits before shifting right, since the shifts run in promoted int and would otherwise cancel out
+        ((unsigned char*)pr)[inumbytes - 1] = (unsigned char)(((unsigned char*)pa)[inumbytes - 1] << bits) >> bits;
+    }
+    // zero-extend the rest of the bytes
+    memset((char*)pr + inumbytes, 0, onumbytes - inumbytes);
+}
+
+extern "C" DLLEXPORT
+void LLVMTrunc(unsigned inumbits, integerPart *pa, unsigned onumbits, integerPart *pr) { // truncate by copying only the first ceil(onumbits / 8) bytes of pa into pr
+    assert(inumbits > onumbits);
+    unsigned onumbytes = RoundUpToAlignment(onumbits, host_char_bit) / host_char_bit;
+    memcpy(pr, pa, onumbytes); // keeping the leading bytes is a truncation only on little-endian layouts, as assumed elsewhere in this file
+}
+
+extern "C" DLLEXPORT
+unsigned countTrailingZeros_8(uint8_t Val) { // number of trailing zero bits in Val; 8 when Val is zero
+#ifdef LLVM35
+    return countTrailingZeros(Val);
+#else
+    return Val ? CountTrailingZeros_32(Val) : 8; // the 32-bit helper reports 32 for zero; match the 8-bit answer of the LLVM35 branch
+#endif
+}
+
+extern "C" DLLEXPORT
+unsigned countTrailingZeros_16(uint16_t Val) { // number of trailing zero bits in Val; 16 when Val is zero
+#ifdef LLVM35
+    return countTrailingZeros(Val);
+#else
+    return Val ? CountTrailingZeros_32(Val) : 16; // the 32-bit helper reports 32 for zero; match the 16-bit answer of the LLVM35 branch
+#endif
+}
+
+extern "C" DLLEXPORT
+unsigned countTrailingZeros_32(uint32_t Val) { // C-callable wrapper; which LLVM helper is used depends on the LLVM35 macro
+#ifdef LLVM35
+    return countTrailingZeros(Val);
+#else
+    return CountTrailingZeros_32(Val);
+#endif
+}
+
+extern "C" DLLEXPORT
+unsigned countTrailingZeros_64(uint64_t Val) { // C-callable wrapper; which LLVM helper is used depends on the LLVM35 macro
+#ifdef LLVM35
+    return countTrailingZeros(Val);
+#else
+    return CountTrailingZeros_64(Val);
+#endif
+}
+
+extern "C" DLLEXPORT
+void jl_LLVMSMod(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // signed modulus: starts from srem and adjusts when the operand signs differ
+    CREATE(a)
+    CREATE(b)
+    APInt r = a.srem(b);
+    if (a.isNegative() != b.isNegative()) {
+        r = (b + r).srem(b); // shift the remainder by one divisor, then reduce again so an exact multiple still yields zero
+    }
+    ASSIGN(r, r)
+}
+
+extern "C" DLLEXPORT
+void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr) { // *pr = -*pa when *pb is negative, otherwise *pr = *pa
+    unsigned numbytes = RoundUpToAlignment(numbits, host_char_bit) / host_char_bit;
+    int signbit = (numbits - 1) % host_char_bit; // position of the sign bit inside the top byte
+    int sign = ((unsigned char*)pb)[numbytes - 1] & (1 << signbit) ? -1 : 0; // the sign comes from the second operand, not from the value being flipped
+    if (sign)
+        LLVMNeg(numbits, pa, pr);
+    else
+        memcpy(pr, pa,  numbytes);
+}
+
+extern "C" DLLEXPORT
+unsigned LLVMCountPopulation(unsigned numbits, integerPart *pa) { // returns APInt::countPopulation of the numbits-wide value at pa
+    CREATE(a)
+    return a.countPopulation();
+}
+
+extern "C" DLLEXPORT
+unsigned LLVMCountTrailingOnes(unsigned numbits, integerPart *pa) { // returns APInt::countTrailingOnes of the numbits-wide value at pa
+    CREATE(a)
+    return a.countTrailingOnes();
+}
+
+extern "C" DLLEXPORT
+unsigned LLVMCountTrailingZeros(unsigned numbits, integerPart *pa) { // returns APInt::countTrailingZeros of the numbits-wide value at pa
+    CREATE(a)
+    return a.countTrailingZeros();
+}
+
+extern "C" DLLEXPORT
+unsigned LLVMCountLeadingOnes(unsigned numbits, integerPart *pa) { // returns APInt::countLeadingOnes of the numbits-wide value at pa
+    CREATE(a)
+    return a.countLeadingOnes();
+}
+
+extern "C" DLLEXPORT
+unsigned LLVMCountLeadingZeros(unsigned numbits, integerPart *pa) { // returns APInt::countLeadingZeros of the numbits-wide value at pa
+    CREATE(a)
+    return a.countLeadingZeros();
+}
diff --git a/src/APInt-C.h b/src/APInt-C.h
new file mode 100644
index 0000000000000..da7c08652d2a8
--- /dev/null
+++ b/src/APInt-C.h
@@ -0,0 +1,78 @@
+
+#ifndef APINT_C_H
+#define APINT_C_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+typedef void integerPart;
+
+void LLVMNeg(unsigned numbits, integerPart *pa, integerPart *pr);
+void LLVMByteSwap(unsigned numbits, integerPart *pa, integerPart *pr);
+
+void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+
+void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void LLVMFlipAllBits(unsigned numbits, integerPart *pa, integerPart *pr);
+
+int LLVMICmpEQ(unsigned numbits, integerPart *pa, integerPart *pb); /* second operand is an input (pb), not a result buffer */
+int LLVMICmpNE(unsigned numbits, integerPart *pa, integerPart *pb);
+int LLVMICmpSLT(unsigned numbits, integerPart *pa, integerPart *pb);
+int LLVMICmpULT(unsigned numbits, integerPart *pa, integerPart *pb);
+int LLVMICmpSLE(unsigned numbits, integerPart *pa, integerPart *pb);
+int LLVMICmpULE(unsigned numbits, integerPart *pa, integerPart *pb);
+
+int LLVMAdd_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+
+unsigned LLVMCountPopulation(unsigned numbits, integerPart *pa);
+unsigned LLVMCountTrailingOnes(unsigned numbits, integerPart *pa);
+unsigned LLVMCountTrailingZeros(unsigned numbits, integerPart *pa);
+unsigned LLVMCountLeadingOnes(unsigned numbits, integerPart *pa);
+unsigned LLVMCountLeadingZeros(unsigned numbits, integerPart *pa);
+
+void LLVMFPtoSI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+void LLVMFPtoUI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+void LLVMSItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+void LLVMUItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+void LLVMSExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+void LLVMZExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+void LLVMTrunc(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+
+int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+
+void jl_LLVMSMod(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+
+unsigned countTrailingZeros_8(uint8_t Val);
+unsigned countTrailingZeros_16(uint16_t Val);
+unsigned countTrailingZeros_32(uint32_t Val);
+unsigned countTrailingZeros_64(uint64_t Val);
+
+uint8_t getSwappedBytes_8(uint8_t Value); // no-op
+uint16_t getSwappedBytes_16(uint16_t Value);
+uint32_t getSwappedBytes_32(uint32_t Value);
+uint64_t getSwappedBytes_64(uint64_t Value);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/Makefile b/src/Makefile
index f403327b5fc46..2731b897445e2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -14,7 +14,7 @@ override CPPFLAGS += $(JCPPFLAGS)
 SRCS := \
 	jltypes gf ast builtins module codegen disasm debuginfo interpreter \
 	alloc dlload sys init task array dump toplevel jl_uv jlapi signal-handling \
-	llvm-simdloop simplevector
+	llvm-simdloop simplevector APInt-C runtime_intrinsics
 ifeq ($(JULIAGC),MARKSWEEP)
 SRCS += gc
 endif
diff --git a/src/alloc.c b/src/alloc.c
index ab453d5a8e264..52be1ed3b2a2b 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -159,25 +159,6 @@ jl_value_t *jl_new_bits(jl_value_t *bt, void *data)
     return jl_new_bits_internal(bt, data, &len);
 }
 
-// run time version of pointerref intrinsic (warning: i is not rooted)
-DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i)
-{
-    JL_TYPECHK(pointerref, pointer, p);
-    JL_TYPECHK(pointerref, long, i);
-    jl_value_t *ety = jl_tparam0(jl_typeof(p));
-    if (ety == (jl_value_t*)jl_any_type) {
-        jl_value_t **pp = (jl_value_t**)(jl_unbox_long(p) + (jl_unbox_long(i)-1)*sizeof(void*));
-        return *pp;
-    }
-    else {
-        if (!jl_is_datatype(ety))
-            jl_error("pointerref: invalid pointer");
-        size_t nb = LLT_ALIGN(jl_datatype_size(ety), ((jl_datatype_t*)ety)->alignment);
-        char *pp = (char*)jl_unbox_long(p) + (jl_unbox_long(i)-1)*nb;
-        return jl_new_bits(ety, pp);
-    }
-}
-
 void jl_assign_bits(void *dest, jl_value_t *bits)
 {
     size_t nb = jl_datatype_size(jl_typeof(bits));
@@ -192,27 +173,6 @@ void jl_assign_bits(void *dest, jl_value_t *bits)
     }
 }
 
-// run time version of pointerset intrinsic (warning: x is not gc-rooted)
-DLLEXPORT void jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *i)
-{
-    JL_TYPECHK(pointerset, pointer, p);
-    JL_TYPECHK(pointerset, long, i);
-    jl_value_t *ety = jl_tparam0(jl_typeof(p));
-    if (ety == (jl_value_t*)jl_any_type) {
-        jl_value_t **pp = (jl_value_t**)(jl_unbox_long(p) + (jl_unbox_long(i)-1)*sizeof(void*));
-        *pp = x;
-    }
-    else {
-        if (!jl_is_datatype(ety))
-            jl_error("pointerset: invalid pointer");
-        size_t nb = LLT_ALIGN(jl_datatype_size(ety), ((jl_datatype_t*)ety)->alignment);
-        char *pp = (char*)jl_unbox_long(p) + (jl_unbox_long(i)-1)*nb;
-        if (jl_typeof(x) != ety)
-            jl_error("pointerset: type mismatch in assign");
-        jl_assign_bits(pp, x);
-    }
-}
-
 int jl_field_index(jl_datatype_t *t, jl_sym_t *fld, int err)
 {
     jl_svec_t *fn = t->name->names;
diff --git a/src/codegen.cpp b/src/codegen.cpp
index 3e88bdfc56bf0..d88be7d4be5ed 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -6121,6 +6121,7 @@ extern "C" void jl_init_codegen(void)
                               "jl_box32", (void*)&jl_box32, m);
     box64_func = boxfunc_llvm(ft2arg(T_pjlvalue, T_pjlvalue, T_int64),
                               "jl_box64", (void*)&jl_box64, m);
+    jl_init_intrinsic_functions_codegen(m);
 }
 
 // for debugging from gdb
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index a1bcb335872f0..d8db10bfe7288 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -43,16 +43,21 @@ namespace JL_I {
         // pointer access
         pointerref, pointerset,
         // c interface
-        ccall, cglobal, llvmcall
+        ccall, cglobal, llvmcall,
+        // terminator
+        fptoui_auto, fptosi_auto,
+        num_intrinsics
     };
 }
 
 using namespace JL_I;
+Function *runtime_func[num_intrinsics];
+unsigned intrinsic_nargs[num_intrinsics];
 
 #include "ccall.cpp"
 
 /*
-  low-level intrinsics design:
+  low-level intrinsics design: TODO: fix description below
   functions like add_int expect unboxed values of matching bit-length.
   every operation that can return an unboxed value does so.
   this maximizes opportunities for composing functions without
@@ -107,12 +112,22 @@ static Type *JL_INTT(Type *t)
     assert(t == T_void);
     return T_void;
 }
-static jl_value_t *JL_JLINTT(Type *t)
+// convert float type to same-size unsigned int type (as a Julia type)
+static jl_value_t *JL_JLUINTT(Type *t)
 {
     assert(!t->isIntegerTy());
-    if (t == T_float32) return (jl_value_t*)jl_float32_type;
-    if (t == T_float64) return (jl_value_t*)jl_float64_type;
-    if (t == Type::getHalfTy(jl_LLVMContext)) return jl_get_global(jl_base_module, jl_symbol("Float16"));
+    if (t == T_float32) return (jl_value_t*)jl_uint32_type;
+    if (t == T_float64) return (jl_value_t*)jl_uint64_type;
+    if (t == Type::getHalfTy(jl_LLVMContext)) return (jl_value_t*)jl_uint16_type;
+    assert(t == T_void);
+    return jl_bottom_type;
+}
+static jl_value_t *JL_JLSINTT(Type *t)
+{
+    assert(!t->isIntegerTy());
+    if (t == T_float32) return (jl_value_t*)jl_int32_type;
+    if (t == T_float64) return (jl_value_t*)jl_int64_type;
+    if (t == Type::getHalfTy(jl_LLVMContext)) return (jl_value_t*)jl_int16_type;
     assert(t == T_void);
     return jl_bottom_type;
 }
@@ -413,10 +428,7 @@ static jl_cgval_t generic_box(jl_value_t *targ, jl_value_t *x, jl_codectx_t *ctx
         }
         else {
             if (!jl_is_leaf_type(v.typ) && !jl_is_bitstype(v.typ)) {
-                // TODO: currently doesn't handle the case where the type of neither argument is understood at compile time
-                // since codegen has no idea what size it might have
-                jl_error("codegen: failed during evaluation of a call to reinterpret");
-                return jl_cgval_t();
+                return jl_cgval_t(); // TODO: XXX
             }
             nb = jl_datatype_size(v.typ);
             llvmt = staticeval_bitstype(v.typ);
@@ -820,25 +832,59 @@ static Value *emit_smod(Value *x, Value *den, jl_codectx_t *ctx)
 }
 
 #define HANDLE(intr,n)                                                  \
-    case intr: if (nargs!=n) jl_error(#intr": wrong number of arguments");
+    case intr:
 
 static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z, size_t nargs,
                                        jl_codectx_t *ctx, jl_datatype_t* *newtyp);
 static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
                                        jl_codectx_t *ctx)
 {
+    assert(f < num_intrinsics);
+    if (f == fptoui && nargs == 1)
+        f = fptoui_auto;
+    if (f == fptosi && nargs == 1)
+        f = fptosi_auto;
+    unsigned expected_nargs = intrinsic_nargs[f];
+    if (expected_nargs && expected_nargs != nargs) {
+        jl_errorf("intrinsic #%d: wrong number of arguments", f);
+    }
+
     switch (f) {
     case ccall: return emit_ccall(args, nargs, ctx);
     case cglobal: return emit_cglobal(args, nargs, ctx);
     case llvmcall: return emit_llvmcall(args, nargs, ctx);
-
-    HANDLE(pointerref,2)
+#if 0
+    default:
+        int ldepth = ctx->gc.argDepth;
+        Value *r;
+        if (nargs == 1) {
+            Value *x = emit_boxed_rooted(args[1], ctx).V;
+            r = builder.CreateCall(runtime_func[f], x);
+        }
+        else if (nargs == 2) {
+            Value *x = emit_boxed_rooted(args[1], ctx).V;
+            Value *y = emit_boxed_rooted(args[2], ctx).V;
+            r = builder.CreateCall2(runtime_func[f], x, y);
+        }
+        else if (nargs == 3) {
+            Value *x = emit_boxed_rooted(args[1], ctx).V;
+            Value *y = emit_boxed_rooted(args[2], ctx).V;
+            Value *z = emit_boxed_rooted(args[3], ctx).V;
+            r = builder.CreateCall3(runtime_func[f], x, y, z);
+        }
+        else {
+            assert(0);
+        }
+        ctx->gc.argDepth = ldepth;
+        return mark_julia_type(r, true, (jl_value_t*)jl_any_type);
+#else
+    case pointerref:
         return emit_pointerref(args[1], args[2], ctx);
-    HANDLE(pointerset,3)
+    case pointerset:
         return emit_pointerset(args[1], args[2], args[3], ctx);
-    HANDLE(box,2)
+    case box:
         return generic_box(args[1], args[2], ctx);
-    HANDLE(unbox,2) // TODO: deprecate this
+    case unbox:
         return generic_box(args[1], args[2], ctx);
     HANDLE(trunc_int,2)
         return generic_trunc(args[1], args[2], ctx, false, false);
@@ -873,46 +919,38 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
         return mark_julia_type(builder.CreateSIToFP(xi, FTnbits(nb)), false, bt);
     }
 
-    case fptoui:
-        if (nargs == 1) {
-            Value *x = FP(auto_unbox(args[1], ctx));
-            if (x->getType() == T_void) return jl_cgval_t(); // auto_unbox threw an error
-            return mark_julia_type(
-                    builder.CreateFPToUI(FP(x), JL_INTT(x->getType())),
-                    false,
-                    JL_JLINTT(x->getType()));
-        }
-        else if (nargs == 2) {
-            jl_value_t *bt = staticeval_bitstype(args[1], "sitofp", ctx);
-            if (!bt) return jl_cgval_t();
-            int nb = get_bitstype_nbits(bt);
-            Value *xf = FP(auto_unbox(args[2],ctx));
-            if (xf->getType() == T_void) return jl_cgval_t(); // auto_unbox threw an error
-            return mark_julia_type(builder.CreateFPToUI(xf, Type::getIntNTy(jl_LLVMContext, nb)), false, bt);
-        }
-        else {
-            jl_error("fptoui: wrong number of arguments");
-        }
+    case fptoui_auto: {
+        Value *x = FP(auto_unbox(args[1], ctx));
+        if (x->getType() == T_void) return jl_cgval_t(); // auto_unbox threw an error
+        return mark_julia_type(
+                builder.CreateFPToUI(FP(x), JL_INTT(x->getType())),
+                false,
+                JL_JLUINTT(x->getType()));
+    }
+    case fptoui: { // two-argument form: args[1] names the target bitstype, args[2] is the floating-point value
+        jl_value_t *bt = staticeval_bitstype(args[1], "fptoui", ctx); // pass this intrinsic's own name (was the copy-pasted "sitofp")
+        if (!bt) return jl_cgval_t();
+        int nb = get_bitstype_nbits(bt);
+        Value *xf = FP(auto_unbox(args[2],ctx));
+        if (xf->getType() == T_void) return jl_cgval_t(); // auto_unbox threw an error
+        return mark_julia_type(builder.CreateFPToUI(xf, Type::getIntNTy(jl_LLVMContext, nb)), false, bt);
+    }
 
-    case fptosi:
-        if (nargs == 1) {
-            Value *x = FP(auto_unbox(args[1], ctx));
-            return mark_julia_type(
-                    builder.CreateFPToSI(FP(x), JL_INTT(x->getType())),
-                    false,
-                    JL_JLINTT(x->getType()));
-        }
-        else if (nargs == 2) {
-            jl_value_t *bt = staticeval_bitstype(args[1], "sitofp", ctx);
-            if (!bt) return jl_cgval_t();
-            int nb = get_bitstype_nbits(bt);
-            Value *xf = FP(auto_unbox(args[2],ctx));
-            if (xf->getType() == T_void) return jl_cgval_t(); // auto_unbox threw an error
-            return mark_julia_type(builder.CreateFPToSI(xf, Type::getIntNTy(jl_LLVMContext, nb)), false, bt);
-        }
-        else {
-            jl_error("fptosi: wrong number of arguments");
-        }
+    case fptosi_auto: {
+        Value *x = FP(auto_unbox(args[1], ctx));
+        return mark_julia_type(
+                builder.CreateFPToSI(FP(x), JL_INTT(x->getType())),
+                false,
+                JL_JLSINTT(x->getType()));
+    }
+    case fptosi: { // two-argument form: args[1] names the target bitstype, args[2] is the floating-point value
+        jl_value_t *bt = staticeval_bitstype(args[1], "fptosi", ctx); // pass this intrinsic's own name (was the copy-pasted "sitofp")
+        if (!bt) return jl_cgval_t();
+        int nb = get_bitstype_nbits(bt);
+        Value *xf = FP(auto_unbox(args[2],ctx));
+        if (xf->getType() == T_void) return jl_cgval_t(); // auto_unbox threw an error
+        return mark_julia_type(builder.CreateFPToSI(xf, Type::getIntNTy(jl_LLVMContext, nb)), false, bt);
+    }
 
     HANDLE(fptrunc,2) {
         jl_value_t *bt = staticeval_bitstype(args[1], "sitofp", ctx);
@@ -1016,7 +1054,7 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
             r = builder.CreateBitCast(r, x->getType());
         return mark_julia_type(r, false, newtyp ? newtyp : xinfo.typ);
     }
-
+#endif
     }
     assert(0);
 }
@@ -1385,7 +1423,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
     HANDLE(powi_llvm,2) {
         x = FP(x);
         y = JL_INT(y);
-        Type *tx = x->getType();
+        Type *tx = x->getType(); // TODO: LLVM expects this to be i32
 #ifdef LLVM36
         Type *ts[1] = { tx };
         Value *powi = Intrinsic::getDeclaration(jl_Module, Intrinsic::powi,
@@ -1409,6 +1447,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
                                                             ArrayRef<Type*>(x->getType())),
                                   x);
     }
+
     default:
         assert(false);
     }
@@ -1464,6 +1503,7 @@ extern "C" void jl_init_intrinsic_functions(void)
     jl_module_t *inm = jl_new_module(jl_symbol("Intrinsics"));
     inm->parent = jl_core_module;
     jl_set_const(jl_core_module, jl_symbol("Intrinsics"), (jl_value_t*)inm);
+
     ADD_I(box); ADD_I(unbox);
     ADD_I(neg_int); ADD_I(add_int); ADD_I(sub_int); ADD_I(mul_int);
     ADD_I(sdiv_int); ADD_I(udiv_int); ADD_I(srem_int); ADD_I(urem_int);
@@ -1501,6 +1541,83 @@ extern "C" void jl_init_intrinsic_functions(void)
     ADD_I(checked_trunc_uint);
     ADD_I(check_top_bit);
     ADD_I(nan_dom_err);
+    //ADD_I(fptosi_auto); ADD_I(fptoui_auto); // these intrinsics are "hidden" in fpto*i
     ADD_I(ccall); ADD_I(cglobal);
     ADD_I(llvmcall);
 }
+#undef ADD_I
+
+static void add_intrinsic_to_codegen(Module *m, const std::string &name, intrinsic f,
+        unsigned nargs, const std::vector<Type *> &args, void *pfunc) { // args is only read: borrow it, don't copy
+    Function *func = Function::Create(FunctionType::get(T_pjlvalue, args, false), // (args...) -> T_pjlvalue, not vararg
+                                      Function::ExternalLinkage, name, m);
+    runtime_func[f] = func;         // declaration recorded for intrinsic f
+    add_named_global(func, pfunc);  // associate the declaration with the C implementation pointer
+    intrinsic_nargs[f] = nargs;     // argument count recorded for intrinsic f
+}
+
+static void add_intrinsic_to_codegen(intrinsic alias, intrinsic base)
+{
+    runtime_func[alias] = runtime_func[base];
+    intrinsic_nargs[alias] = intrinsic_nargs[base];
+}
+
+#define ADD_I(name, nargs) add_intrinsic_to_codegen(m, "jl_" #name, name, nargs, args##nargs, (void*)&jl_##name)
+#define ALIAS(alias, base) add_intrinsic_to_codegen(alias, base)
+
+static void jl_init_intrinsic_functions_codegen(Module *m)
+{
+    std::vector<Type *> args1(0);
+    args1.push_back(T_pjlvalue);
+    std::vector<Type *> args2(0);
+    args2.push_back(T_pjlvalue);
+    args2.push_back(T_pjlvalue);
+    std::vector<Type *> args3(0);
+    args3.push_back(T_pjlvalue);
+    args3.push_back(T_pjlvalue);
+    args3.push_back(T_pjlvalue);
+
+    add_intrinsic_to_codegen(m, "jl_reinterpret", box,
+        2, args2, (void*)&jl_reinterpret);
+    ALIAS(unbox, box);
+    ADD_I(neg_int, 1); ADD_I(add_int, 2); ADD_I(sub_int, 2); ADD_I(mul_int, 2);
+    ADD_I(sdiv_int, 2); ADD_I(udiv_int, 2); ADD_I(srem_int, 2); ADD_I(urem_int, 2);
+    ADD_I(smod_int, 2);
+    ADD_I(neg_float, 1); ADD_I(add_float, 2); ADD_I(sub_float, 2); ADD_I(mul_float, 2);
+    ADD_I(div_float, 2); ADD_I(rem_float, 2); ADD_I(fma_float, 3); ADD_I(muladd_float, 3);
+    ALIAS(neg_float_fast, neg_float); ALIAS(add_float_fast, add_float); ALIAS(sub_float_fast, sub_float);
+    ALIAS(mul_float_fast, mul_float); ALIAS(div_float_fast, div_float); ALIAS(rem_float_fast, rem_float);
+    ADD_I(eq_int, 2); ADD_I(ne_int, 2);
+    ADD_I(slt_int, 2); ADD_I(ult_int, 2);
+    ADD_I(sle_int, 2); ADD_I(ule_int, 2);
+    ADD_I(eq_float, 2); ADD_I(ne_float, 2);
+    ADD_I(lt_float, 2); ADD_I(le_float, 2);
+    ALIAS(eq_float_fast, eq_float); ALIAS(ne_float_fast, ne_float);
+    ALIAS(lt_float_fast, lt_float); ALIAS(le_float_fast, le_float);
+    ADD_I(fpiseq, 2); ADD_I(fpislt, 2);
+    ADD_I(and_int, 2); ADD_I(or_int, 2); ADD_I(xor_int, 2); ADD_I(not_int, 1);
+    ADD_I(shl_int, 2); ADD_I(lshr_int, 2); ADD_I(ashr_int, 2); ADD_I(bswap_int, 1);
+    ADD_I(ctpop_int, 1); ADD_I(ctlz_int, 1); ADD_I(cttz_int, 1);
+    ADD_I(sext_int, 2); ADD_I(zext_int, 2); ADD_I(trunc_int, 2);
+    ADD_I(fptoui, 2); ADD_I(fptosi, 2);
+    ADD_I(uitofp, 2); ADD_I(sitofp, 2);
+    ADD_I(fptrunc, 2); ADD_I(fpext, 2);
+    ADD_I(abs_float, 1); ADD_I(copysign_float, 2);
+    ADD_I(flipsign_int, 2); ADD_I(select_value, 3);
+    ADD_I(ceil_llvm, 1); ADD_I(floor_llvm, 1); ADD_I(trunc_llvm, 1); ADD_I(rint_llvm, 1);
+    ADD_I(sqrt_llvm, 1); ADD_I(powi_llvm, 2);
+    ALIAS(sqrt_llvm_fast, sqrt_llvm);
+    ADD_I(pointerref, 2); ADD_I(pointerset, 3);
+    ADD_I(checked_sadd, 2); ADD_I(checked_uadd, 2);
+    ADD_I(checked_ssub, 2); ADD_I(checked_usub, 2);
+    ADD_I(checked_smul, 2); ADD_I(checked_umul, 2);
+    ADD_I(checked_fptosi, 2); ADD_I(checked_fptoui, 2);
+    ADD_I(checked_trunc_sint, 2);
+    ADD_I(checked_trunc_uint, 2);
+    ADD_I(check_top_bit, 1);
+    ADD_I(nan_dom_err, 2);
+    ADD_I(fptosi_auto, 1); ADD_I(fptoui_auto, 1);
+}
+
+#undef ADD_I
+#undef ALIAS
diff --git a/src/julia_internal.h b/src/julia_internal.h
index ea14b50079e83..d3798c1964f96 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -224,6 +224,98 @@ DLLEXPORT int jl_fs_rename(const char *src_path, const char *dst_path);
 extern DLLEXPORT jl_value_t *jl_segv_exception;
 #endif
 
+// Runtime intrinsics //
+
+DLLEXPORT jl_value_t *jl_reinterpret(jl_value_t *ty, jl_value_t *v);
+DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i);
+DLLEXPORT jl_value_t *jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *i);
+
+DLLEXPORT jl_value_t *jl_neg_int(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_add_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_sub_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_mul_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_sdiv_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_udiv_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_srem_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_urem_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_smod_int(jl_value_t *a, jl_value_t *b);
+
+DLLEXPORT jl_value_t *jl_neg_float(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_add_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_sub_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_mul_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_div_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_rem_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_fma_float(jl_value_t *a, jl_value_t *b, jl_value_t *c);
+DLLEXPORT jl_value_t *jl_muladd_float(jl_value_t *a, jl_value_t *b, jl_value_t *c);
+
+DLLEXPORT jl_value_t *jl_eq_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_ne_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_slt_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_ult_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_sle_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_ule_int(jl_value_t *a, jl_value_t *b);
+
+DLLEXPORT jl_value_t *jl_eq_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_ne_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_lt_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_le_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_fpiseq(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_fpislt(jl_value_t *a, jl_value_t *b);
+
+DLLEXPORT jl_value_t *jl_not_int(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_and_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_or_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_xor_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_shl_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_lshr_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_ashr_int(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_bswap_int(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_ctpop_int(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_ctlz_int(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_cttz_int(jl_value_t *a);
+
+DLLEXPORT jl_value_t *jl_sext_int(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_zext_int(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_trunc_int(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_sitofp(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_uitofp(jl_value_t *ty, jl_value_t *a);
+
+DLLEXPORT jl_value_t *jl_fptoui(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_fptosi(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_fptrunc(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_fpext(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_fptoui_auto(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_fptosi_auto(jl_value_t *a);
+
+DLLEXPORT jl_value_t *jl_checked_fptoui(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_checked_fptosi(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_checked_trunc_sint(jl_value_t *ty, jl_value_t *a);
+DLLEXPORT jl_value_t *jl_checked_trunc_uint(jl_value_t *ty, jl_value_t *a);
+
+DLLEXPORT jl_value_t *jl_check_top_bit(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_checked_sadd(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_checked_uadd(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_checked_ssub(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_checked_usub(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_checked_smul(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_checked_umul(jl_value_t *a, jl_value_t *b);
+
+DLLEXPORT jl_value_t *jl_nan_dom_err(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_ceil_llvm(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_floor_llvm(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_trunc_llvm(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_rint_llvm(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_sqrt_llvm(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_powi_llvm(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_abs_float(jl_value_t *a);
+DLLEXPORT jl_value_t *jl_copysign_float(jl_value_t *a, jl_value_t *b);
+DLLEXPORT jl_value_t *jl_flipsign_int(jl_value_t *a, jl_value_t *b);
+
+DLLEXPORT jl_value_t *jl_select_value(jl_value_t *isfalse, jl_value_t *a, jl_value_t *b);
+
+
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
new file mode 100644
index 0000000000000..bd2c296b99f87
--- /dev/null
+++ b/src/runtime_intrinsics.c
@@ -0,0 +1,930 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+//
+// This is an implementation of the Julia intrinsic functions against boxed types
+// excluding the c interface (ccall, cglobal, llvmcall)
+//
+// this file assumes a little-endian processor, although that isn't too hard to fix
+// it also assumes two's complement negative numbers, which might be a bit harder to fix
+//
+// TODO: add half-float support
+
+#include "julia.h"
+#include "julia_internal.h"
+#include "APInt-C.h"
+const unsigned int host_char_bit = 8;
+
+// run time version of box/unbox intrinsic
+DLLEXPORT jl_value_t *jl_reinterpret(jl_value_t *ty, jl_value_t *v)
+{
+    JL_TYPECHK(reinterpret, datatype, ty);
+    if (!jl_is_leaf_type(ty) || !jl_is_bitstype(ty))
+        jl_error("reinterpret: target type not a leaf bitstype");
+    if (!jl_is_bitstype(jl_typeof(v)))
+        jl_error("reinterpret: value not a bitstype");
+    if (jl_datatype_size(jl_typeof(v)) != jl_datatype_size(ty))
+        jl_error("reinterpret: argument size does not match size of target type");
+    if (ty == jl_typeof(v))
+        return v;
+    if (ty == (jl_value_t*)jl_bool_type)
+        return *(uint8_t*)jl_data_ptr(v) & 1 ? jl_true : jl_false;
+    return jl_new_bits(ty, jl_data_ptr(v));
+}
+
+// run time version of pointerref intrinsic (warning: i is not rooted)
+DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i)
+{
+    JL_TYPECHK(pointerref, pointer, p);
+    JL_TYPECHK(pointerref, long, i);
+    jl_value_t *ety = jl_tparam0(jl_typeof(p));
+    if (ety == (jl_value_t*)jl_any_type) {
+        jl_value_t **pp = (jl_value_t**)(jl_unbox_long(p) + (jl_unbox_long(i)-1)*sizeof(void*));
+        return *pp;
+    }
+    else {
+        if (!jl_is_datatype(ety))
+            jl_error("pointerref: invalid pointer");
+        size_t nb = LLT_ALIGN(jl_datatype_size(ety), ((jl_datatype_t*)ety)->alignment);
+        char *pp = (char*)jl_unbox_long(p) + (jl_unbox_long(i)-1)*nb;
+        return jl_new_bits(ety, pp);
+    }
+}
+
+// run time version of pointerset intrinsic (warning: x is not gc-rooted)
+DLLEXPORT jl_value_t *jl_pointerset(jl_value_t *p, jl_value_t *x, jl_value_t *i)
+{
+    JL_TYPECHK(pointerset, pointer, p);
+    JL_TYPECHK(pointerset, long, i);
+    jl_value_t *ety = jl_tparam0(jl_typeof(p));
+    if (ety == (jl_value_t*)jl_any_type) {
+        jl_value_t **pp = (jl_value_t**)(jl_unbox_long(p) + (jl_unbox_long(i)-1)*sizeof(void*));
+        *pp = x;
+    }
+    else {
+        if (!jl_is_datatype(ety))
+            jl_error("pointerset: invalid pointer");
+        size_t nb = LLT_ALIGN(jl_datatype_size(ety), ((jl_datatype_t*)ety)->alignment);
+        char *pp = (char*)jl_unbox_long(p) + (jl_unbox_long(i)-1)*nb;
+        if (jl_typeof(x) != ety)
+            jl_error("pointerset: type mismatch in assign");
+        jl_assign_bits(pp, x);
+    }
+    return p;
+}
+
+
+static inline unsigned int next_power_of_two(unsigned int val) {
+  /* this function taken from libuv src/unix/core.c */
+  val -= 1;
+  val |= val >> 1;
+  val |= val >> 2;
+  val |= val >> 4;
+  val |= val >> 8;
+  val |= val >> 16;
+  val += 1;
+  return val;
+}
+
+static inline char signbitbyte(void *a, unsigned bytes) {
+    // fill byte for sign-extending a signed number of n bytes: ~0 if its last (top, little-endian) byte is negative, else 0
+    return ((signed char*)a)[bytes-1] < 0 ? ~0 : 0;
+}
+
+static inline char usignbitbyte(void *a, unsigned bytes) {
+    // sign bit of an unsigned number
+    return 0;
+}
+
+static inline unsigned select_by_size(unsigned sz)
+{
+    /* choose the right sized function specialization */
+    switch (sz) {
+    default: return 0;
+    case  1: return 1;
+    case  2: return 2;
+    case  4: return 3;
+    case  8: return 4;
+    case 16: return 5;
+    }
+}
+
+#define SELECTOR_FUNC(intrinsic) /* defines select_<intrinsic>(): picks a size-specialized entry from a 6-slot table */ \
+    typedef intrinsic##_t select_##intrinsic##_t[6]; \
+    static inline intrinsic##_t select_##intrinsic(unsigned sz, select_##intrinsic##_t list) \
+    { \
+        return list[select_by_size(sz)] ? list[select_by_size(sz)] : list[0]; /* empty slot -> generic entry 0 */ \
+    }
+
+#define fp_select(a, func) /* call func##f for float-sized a, func otherwise; parenthesized so it is one operand */ \
+    (sizeof(a) == sizeof(float) ? func##f((float)(a)) : func(a))
+#define fp_select2(a, b, func) /* two-argument variant of fp_select */ \
+    (sizeof(a) == sizeof(float) ? func##f(a, b) : func(a, b))
+
+// fast-function generators //
+
+// integer input
+// OP::Function macro(input)
+// name::unique string
+// nbits::number of bits
+// c_type::c_type corresponding to nbits
+#define un_iintrinsic_ctype(OP, name, nbits, c_type) \
+static inline void jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pr) \
+{ \
+    c_type a = *(c_type*)pa; \
+    *(c_type*)pr = OP(a); \
+}
+
+// integer input, unsigned output
+// OP::Function macro(input)
+// name::unique string
+// nbits::number of bits
+// c_type::c_type corresponding to nbits
+#define uu_iintrinsic_ctype(OP, name, nbits, c_type) \
+static inline unsigned jl_##name##nbits(unsigned runtime_nbits, void *pa) \
+{ \
+    c_type a = *(c_type*)pa; \
+    return OP(a); \
+}
+
+// floating point
+// OP::Function macro(output pointer, input)
+// name::unique string
+// nbits::number of bits in the *input*
+// c_type::c_type corresponding to nbits
+#define un_fintrinsic_ctype(OP, name, c_type) \
+static inline void name(unsigned osize, void *pa, void *pr) \
+{ \
+    c_type a = *(c_type*)pa; \
+    OP((c_type*)pr, a); \
+}
+
+// float or integer inputs
+// OP::Function macro(inputa, inputb)
+// name::unique string
+// nbits::number of bits
+// c_type::c_type corresponding to nbits
+#define bi_intrinsic_ctype(OP, name, nbits, c_type) \
+static void jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb, void *pr) \
+{ \
+    c_type a = *(c_type*)pa; \
+    c_type b = *(c_type*)pb; \
+    *(c_type*)pr = (c_type)OP(a, b); \
+}
+
+// float or integer inputs, bool output
+// OP::Function macro(inputa, inputb)
+// name::unique string
+// nbits::number of bits
+// c_type::c_type corresponding to nbits
+#define bool_intrinsic_ctype(OP, name, nbits, c_type) \
+static int jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb) \
+{ \
+    c_type a = *(c_type*)pa; \
+    c_type b = *(c_type*)pb; \
+    return OP(a, b); \
+}
+
+// integer inputs, with precondition test
+// OP::Function macro(inputa, inputb)
+// name::unique string
+// nbits::number of bits
+// c_type::c_type corresponding to nbits
+#define checked_intrinsic_ctype(CHECK_OP, OP, name, nbits, c_type) \
+static int jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb, void *pr) \
+{ \
+    c_type a = *(c_type*)pa; \
+    c_type b = *(c_type*)pb; \
+    if (CHECK_OP(a, b)) \
+        return 1; \
+    *(c_type*)pr = (c_type)OP(a, b); \
+    return 0; \
+}
+
+// float inputs
+// OP::Function macro(inputa, inputb, inputc)
+// name::unique string
+// nbits::number of bits
+// c_type::c_type corresponding to nbits
+#define ter_intrinsic_ctype(OP, name, nbits, c_type) \
+static void jl_##name##nbits(unsigned runtime_nbits, void *pa, void *pb, void *pc, void *pr) \
+{ \
+    c_type a = *(c_type*)pa; \
+    c_type b = *(c_type*)pb; \
+    c_type c = *(c_type*)pc; \
+    *(c_type*)pr = (c_type)OP(a, b, c); \
+}
+
+
+// unary operator generator //
+
+typedef void (*intrinsic_1_t)(unsigned, void*, void*);
+SELECTOR_FUNC(intrinsic_1)
+#define un_iintrinsic(name, u) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *a) \
+{ \
+    return jl_iintrinsic_1(jl_typeof(a), a, #name, u##signbitbyte, jl_intrinsiclambda_ty1, name##_list); \
+}
+#define un_iintrinsic_fast(LLVMOP, OP, name, u) \
+un_iintrinsic_ctype(OP, name, 8, u##int##8_t) \
+un_iintrinsic_ctype(OP, name, 16, u##int##16_t) \
+un_iintrinsic_ctype(OP, name, 32, u##int##32_t) \
+un_iintrinsic_ctype(OP, name, 64, u##int##64_t) \
+static select_intrinsic_1_t name##_list = { \
+    LLVMOP, \
+    jl_##name##8, \
+    jl_##name##16, \
+    jl_##name##32, \
+    jl_##name##64, \
+}; \
+un_iintrinsic(name, u)
+#define un_iintrinsic_slow(LLVMOP, name, u) \
+static select_intrinsic_1_t name##_list = { \
+    LLVMOP \
+}; \
+un_iintrinsic(name, u)
+
+typedef unsigned (*intrinsic_u1_t)(unsigned, void*);
+SELECTOR_FUNC(intrinsic_u1)
+#define uu_iintrinsic(name, u) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *a) \
+{ \
+    return jl_iintrinsic_1(jl_typeof(a), a, #name, u##signbitbyte, jl_intrinsiclambda_u1, name##_list); \
+}
+#define uu_iintrinsic_fast(LLVMOP, OP, name, u) \
+uu_iintrinsic_ctype(OP, name, 8, u##int##8_t) \
+uu_iintrinsic_ctype(OP, name, 16, u##int##16_t) \
+uu_iintrinsic_ctype(OP, name, 32, u##int##32_t) \
+uu_iintrinsic_ctype(OP, name, 64, u##int##64_t) \
+static select_intrinsic_u1_t name##_list = { \
+    LLVMOP, \
+    jl_##name##8, \
+    jl_##name##16, \
+    jl_##name##32, \
+    jl_##name##64, \
+}; \
+uu_iintrinsic(name, u)
+#define uu_iintrinsic_slow(LLVMOP, name, u) \
+static select_intrinsic_u1_t name##_list = { \
+    LLVMOP \
+}; \
+uu_iintrinsic(name, u)
+
+static inline jl_value_t *jl_iintrinsic_1(jl_value_t *ty, jl_value_t *a, const char *name, char (*getsign)(void*, unsigned),
+        jl_value_t* (*lambda1)(jl_value_t*, void*, unsigned, unsigned, void*), void *list)
+{   /* common driver for unary integer intrinsics: check types, pad the operand, then run lambda1 over `list` */
+    if (!jl_is_bitstype(jl_typeof(a)))
+        jl_errorf("%s: value is not a bitstype", name);
+    if (!jl_is_bitstype(ty))
+        jl_errorf("%s: type is not a bitstype", name);
+    void *pa = jl_data_ptr(a);
+    unsigned isize = jl_datatype_size(jl_typeof(a));  /* input size in bytes */
+    unsigned isize2 = next_power_of_two(isize);
+    unsigned osize = jl_datatype_size(ty);            /* result size in bytes */
+    unsigned osize2 = next_power_of_two(osize);
+    if (isize2 > osize2)
+        osize2 = isize2;                              /* operate at the wider of the two rounded sizes */
+    if (osize2 > isize || isize2 > isize) {
+        /* if needed, round type up to a real c-type and set/clear the unused bits */
+        void *pa2;
+        pa2 = alloca(osize2);
+        /* TODO: this memcpy assumes little-endian,
+         * for big-endian, need to align the copy to the other end */
+        memcpy(pa2, pa, isize);
+        memset((char*)pa2 + isize, getsign(pa, isize), osize2 - isize); /* byte-wise offset; pad with the fill byte */
+        pa = pa2;
+    }
+    jl_value_t *newv = lambda1(ty, pa, osize, osize2, list);
+    if (ty == (jl_value_t*)jl_bool_type)              /* Bool results map to the jl_true / jl_false objects */
+        return *(uint8_t*)jl_data_ptr(newv) & 1 ? jl_true : jl_false;
+    return newv;
+}
+
+static inline jl_value_t *jl_intrinsiclambda_ty1(jl_value_t *ty, void *pa, unsigned osize, unsigned osize2, void *voidlist)
+{
+    jl_value_t *newv = newstruct((jl_datatype_t*)ty);
+    intrinsic_1_t op = select_intrinsic_1(osize2, (intrinsic_1_t*)voidlist);
+    op(osize * host_char_bit, pa, jl_data_ptr(newv));
+    return newv;
+}
+
+static inline jl_value_t *jl_intrinsiclambda_u1(jl_value_t *ty, void *pa, unsigned osize, unsigned osize2, void *voidlist)
+{
+    jl_value_t *newv = newstruct((jl_datatype_t*)ty);
+    intrinsic_u1_t op = select_intrinsic_u1(osize2, (intrinsic_u1_t*)voidlist);
+    unsigned cnt = op(osize * host_char_bit, pa);
+    // TODO: the following memset/memcpy assumes little-endian
+    // for big-endian, need to copy from the other end of cnt
+    if (osize > sizeof(unsigned)) {
+        // perform zext, if needed
+        memset((char*)jl_data_ptr(newv) + sizeof(unsigned), 0, osize - sizeof(unsigned));
+        osize = sizeof(unsigned);
+    }
+    memcpy(jl_data_ptr(newv), &cnt, osize);
+    return newv;
+}
+
+// conversion operator
+
+typedef void (*intrinsic_cvt_t)(unsigned, void*, unsigned, void*);
+typedef unsigned (*intrinsic_cvt_check_t)(unsigned, unsigned, void*);
+#define cvt_iintrinsic_checked(LLVMOP, check_op, name) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *ty, jl_value_t *a) \
+{ \
+    return jl_intrinsic_cvt(ty, a, #name, LLVMOP, check_op); \
+}
+#define cvt_iintrinsic(LLVMOP, name) \
+    cvt_iintrinsic_checked(LLVMOP, NULL, name) \
+
+static inline jl_value_t *jl_intrinsic_cvt(jl_value_t *ty, jl_value_t *a, const char *name, intrinsic_cvt_t op, intrinsic_cvt_check_t check_op)
+{
+    jl_value_t *aty = jl_typeof(a);
+    if (!jl_is_bitstype(aty))
+        jl_errorf("%s: value is not a bitstype", name);
+    if (!jl_is_bitstype(ty))
+        jl_errorf("%s: type is not a bitstype", name);
+    void *pa = jl_data_ptr(a);
+    unsigned isize = jl_datatype_size(aty);
+    unsigned osize = jl_datatype_size(ty);
+    if (check_op && check_op(isize, osize, pa))
+        jl_throw(jl_inexact_exception);
+    jl_value_t *newv = newstruct((jl_datatype_t*)ty);
+    op(aty == (jl_value_t*)jl_bool_type ? 1 : isize * host_char_bit, pa,
+            osize * host_char_bit, jl_data_ptr(newv));
+    if (ty == (jl_value_t*)jl_bool_type)
+        return *(uint8_t*)jl_data_ptr(newv) & 1 ? jl_true : jl_false;
+    return newv;
+}
+
+// floating point
+
+#define un_fintrinsic_withtype(OP, name) \
+un_fintrinsic_ctype(OP, jl_##name##32, float) \
+un_fintrinsic_ctype(OP, jl_##name##64, double) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *ty, jl_value_t *a) \
+{ \
+    return jl_fintrinsic_1(ty, a, #name, jl_##name##32, jl_##name##64); \
+}
+
+#define un_fintrinsic(OP, name) \
+un_fintrinsic_withtype(OP, name##_withtype) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *a) \
+{ \
+    return jl_##name##_withtype(jl_typeof(a), a); \
+}
+
+typedef void (fintrinsic_op1)(unsigned, void*, void*);
+
+static inline jl_value_t *jl_fintrinsic_1(jl_value_t *ty, jl_value_t *a, const char *name, fintrinsic_op1 *floatop, fintrinsic_op1 *doubleop)
+{
+    if (!jl_is_bitstype(jl_typeof(a)))
+        jl_errorf("%s: value is not a bitstype", name);
+    if (!jl_is_bitstype(ty))
+        jl_errorf("%s: type is not a bitstype", name);
+    jl_value_t *newv = newstruct((jl_datatype_t*)ty);
+    void *pa = jl_data_ptr(a), *pr = jl_data_ptr(newv);
+    unsigned sz = jl_datatype_size(jl_typeof(a));
+    unsigned sz2 = jl_datatype_size(ty);
+    switch (sz) {
+    /* choose the right size c-type operation based on the input */
+    case 4:
+        floatop(sz2 * host_char_bit, pa, pr);
+        break;
+    case 8:
+        doubleop(sz2 * host_char_bit, pa, pr);
+        break;
+    default:
+        jl_errorf("%s: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64", name);
+    }
+    return newv;
+}
+
+// binary operator generator //
+
+// integer
+
+typedef void (*intrinsic_2_t)(unsigned, void*, void*, void*);
+SELECTOR_FUNC(intrinsic_2)
+#define bi_iintrinsic(name, u, cvtb) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
+{ \
+    return jl_iintrinsic_2(a, b, #name, u##signbitbyte, jl_intrinsiclambda_2, name##_list, cvtb); \
+}
+#define bi_iintrinsic_cnvtb_fast(LLVMOP, OP, name, u, cvtb) \
+bi_intrinsic_ctype(OP, name, 8, u##int##8_t) \
+bi_intrinsic_ctype(OP, name, 16, u##int##16_t) \
+bi_intrinsic_ctype(OP, name, 32, u##int##32_t) \
+bi_intrinsic_ctype(OP, name, 64, u##int##64_t) \
+static select_intrinsic_2_t name##_list = { \
+    LLVMOP, \
+    jl_##name##8, \
+    jl_##name##16, \
+    jl_##name##32, \
+    jl_##name##64, \
+}; \
+bi_iintrinsic(name, u, cvtb)
+#define bi_iintrinsic_fast(LLVMOP, OP, name, u) \
+    bi_iintrinsic_cnvtb_fast(LLVMOP, OP, name, u, 0)
+
+typedef int (*intrinsic_cmp_t)(unsigned, void*, void*);
+SELECTOR_FUNC(intrinsic_cmp)
+#define cmp_iintrinsic(name, u) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
+{ \
+    return jl_iintrinsic_2(a, b, #name, u##signbitbyte, jl_intrinsiclambda_cmp, name##_list, 0); \
+}
+#define bool_iintrinsic_fast(LLVMOP, OP, name, u) \
+bool_intrinsic_ctype(OP, name, 8, u##int##8_t) \
+bool_intrinsic_ctype(OP, name, 16, u##int##16_t) \
+bool_intrinsic_ctype(OP, name, 32, u##int##32_t) \
+bool_intrinsic_ctype(OP, name, 64, u##int##64_t) \
+static select_intrinsic_cmp_t name##_list = { \
+    LLVMOP, \
+    jl_##name##8, \
+    jl_##name##16, \
+    jl_##name##32, \
+    jl_##name##64, \
+}; \
+cmp_iintrinsic(name, u)
+
+typedef int (*intrinsic_checked_t)(unsigned, void*, void*, void*);
+SELECTOR_FUNC(intrinsic_checked)
+#define checked_iintrinsic(name, u) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
+{ \
+    return jl_iintrinsic_2(a, b, #name, u##signbitbyte, jl_intrinsiclambda_checked, name##_list, 0); \
+}
+#define checked_iintrinsic_fast(LLVMOP, CHECK_OP, OP, name, u) \
+checked_intrinsic_ctype(CHECK_OP, OP, name, 8, u##int##8_t) \
+checked_intrinsic_ctype(CHECK_OP, OP, name, 16, u##int##16_t) \
+checked_intrinsic_ctype(CHECK_OP, OP, name, 32, u##int##32_t) \
+checked_intrinsic_ctype(CHECK_OP, OP, name, 64, u##int##64_t) \
+static select_intrinsic_checked_t name##_list = { \
+    LLVMOP, \
+    jl_##name##8, \
+    jl_##name##16, \
+    jl_##name##32, \
+    jl_##name##64, \
+}; \
+checked_iintrinsic(name, u)
+#define checked_iintrinsic_slow(LLVMOP, name, u) \
+static select_intrinsic_checked_t name##_list = { \
+    LLVMOP \
+}; \
+checked_iintrinsic(name, u)
+
+static inline jl_value_t *jl_iintrinsic_2(jl_value_t *a, jl_value_t *b, const char *name, char (*getsign)(void*, unsigned),
+        jl_value_t* (*lambda2)(jl_value_t*, void*, void*, unsigned, unsigned, void*),
+        void *list, int cvtb)
+{   /* common driver for binary integer intrinsics; the result type passed to lambda2 is typeof(a) */
+    jl_value_t *ty = jl_typeof(a);
+    jl_value_t *tyb = jl_typeof(b);
+    if (tyb != ty) {
+        if (!cvtb)                                    /* a differing type for b is only accepted when cvtb is set */
+            jl_errorf("%s: types of a and b must match", name);
+        if (!jl_is_bitstype(tyb))
+            jl_errorf("%s: b is not a bitstypes", name);
+    }
+    if (!jl_is_bitstype(ty))
+        jl_errorf("%s: a is not a bitstypes", name);
+    void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b);
+    unsigned sz = jl_datatype_size(ty);               /* size of a in bytes */
+    unsigned sz2 = next_power_of_two(sz);             /* working size used to select the operation */
+    unsigned szb = jl_datatype_size(tyb);             /* size of b in bytes (may differ from sz when cvtb) */
+    if (sz2 > sz) {
+        /* round type up to the appropriate c-type and set/clear the unused bits */
+        void *pa2 = alloca(sz2);
+        memcpy(pa2, pa, sz);
+        memset((char*)pa2 + sz, getsign(pa, sz), sz2 - sz);
+        pa = pa2;
+    }
+    if (sz2 > szb) {
+        /* round type up to the appropriate c-type and set/clear/truncate the unused bits */
+        void *pb2 = alloca(sz2);
+        memcpy(pb2, pb, szb);
+        memset((char*)pb2 + szb, getsign(pb, szb), sz2 - szb); /* b's fill byte comes from b's own top byte */
+        pb = pb2;
+    }
+    jl_value_t *newv = lambda2(ty, pa, pb, sz, sz2, list);
+    return newv;
+}
+
+static inline jl_value_t *jl_intrinsiclambda_2(jl_value_t *ty, void *pa, void *pb, unsigned sz, unsigned sz2, void *voidlist)
+{
+    jl_value_t *newv = newstruct((jl_datatype_t*)ty);
+    intrinsic_2_t op = select_intrinsic_2(sz2, (intrinsic_2_t*)voidlist);
+    op(sz * host_char_bit, pa, pb, jl_data_ptr(newv));
+    if (ty == (jl_value_t*)jl_bool_type)
+        return *(uint8_t*)jl_data_ptr(newv) & 1 ? jl_true : jl_false;
+    return newv;
+}
+
+static inline jl_value_t *jl_intrinsiclambda_cmp(jl_value_t *ty, void *pa, void *pb, unsigned sz, unsigned sz2, void *voidlist)
+{
+    intrinsic_cmp_t op = select_intrinsic_cmp(sz2, (intrinsic_cmp_t*)voidlist);
+    int cmp = op(sz * host_char_bit, pa, pb);
+    return cmp ? jl_true : jl_false;
+}
+
+static inline jl_value_t *jl_intrinsiclambda_checked(jl_value_t *ty, void *pa, void *pb, unsigned sz, unsigned sz2, void *voidlist)
+{
+    jl_value_t *newv = newstruct((jl_datatype_t*)ty);
+    intrinsic_checked_t op = select_intrinsic_checked(sz2, (intrinsic_checked_t*)voidlist);
+    int ovflw = op(sz * host_char_bit, pa, pb, jl_data_ptr(newv));
+    if (ovflw)
+        jl_throw(jl_overflow_exception);
+    if (ty == (jl_value_t*)jl_bool_type)
+        return *(uint8_t*)jl_data_ptr(newv) & 1 ? jl_true : jl_false;
+    return newv;
+}
+
+// floating point
+
+#define bi_fintrinsic(OP, name) \
+    bi_intrinsic_ctype(OP, name, 32, float) \
+    bi_intrinsic_ctype(OP, name, 64, double) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
+{ \
+    jl_value_t *ty = jl_typeof(a); \
+    if (jl_typeof(b) != ty) \
+        jl_error(#name ": types of a and b must match"); \
+    if (!jl_is_bitstype(ty)) \
+        jl_error(#name ": values are not bitstypes"); \
+    jl_value_t *newv = newstruct((jl_datatype_t*)ty); \
+    void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pr = jl_data_ptr(newv); \
+    int sz = jl_datatype_size(ty); \
+    switch (sz) { \
+    /* choose the right size c-type operation */ \
+    case 4: \
+        jl_##name##32(32, pa, pb, pr); \
+        break; \
+    case 8: \
+        jl_##name##64(64, pa, pb, pr); \
+        break; \
+    default: \
+        jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \
+    } \
+    return newv; \
+}
+
+#define bool_fintrinsic(OP, name) /* defines jl_<name>(a, b): boxed float comparison returning jl_true or jl_false */ \
+    bool_intrinsic_ctype(OP, name, 32, float) \
+    bool_intrinsic_ctype(OP, name, 64, double) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b) \
+{ \
+    jl_value_t *ty = jl_typeof(a); \
+    if (jl_typeof(b) != ty) \
+        jl_error(#name ": types of a and b must match"); \
+    if (!jl_is_bitstype(ty)) \
+        jl_error(#name ": values are not bitstypes"); \
+    void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b); \
+    int sz = jl_datatype_size(ty); \
+    int cmp; /* set by the 4- and 8-byte cases; the default case reports an error instead (assumes jl_error does not return - TODO confirm) */ \
+    switch (sz) { \
+    /* dispatch on the size in bytes to jl_<name>32 / jl_<name>64 (presumably the helpers emitted by the bool_intrinsic_ctype lines above) */ \
+    case 4: \
+        cmp = jl_##name##32(32, pa, pb); \
+        break; \
+    case 8: \
+        cmp = jl_##name##64(64, pa, pb); \
+        break; \
+    default: \
+        jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \
+    } \
+    return cmp ? jl_true : jl_false; /* any nonzero helper result maps to jl_true */ \
+}
+
+#define ter_fintrinsic(OP, name) /* defines jl_<name>(a, b, c): boxed ternary float intrinsic returning a new value of a's type */ \
+    ter_intrinsic_ctype(OP, name, 32, float) \
+    ter_intrinsic_ctype(OP, name, 64, double) \
+DLLEXPORT jl_value_t *jl_##name(jl_value_t *a, jl_value_t *b, jl_value_t *c) \
+{ \
+    jl_value_t *ty = jl_typeof(a); \
+    if (jl_typeof(b) != ty || jl_typeof(c) != ty) \
+        jl_error(#name ": types of a, b, and c must match"); \
+    if (!jl_is_bitstype(ty)) \
+        jl_error(#name ": values are not bitstypes"); \
+    jl_value_t *newv = newstruct((jl_datatype_t*)ty); /* result is allocated with the operands' type */ \
+    void *pa = jl_data_ptr(a), *pb = jl_data_ptr(b), *pc = jl_data_ptr(c), *pr = jl_data_ptr(newv); \
+    int sz = jl_datatype_size(ty); \
+    switch (sz) { \
+    /* dispatch on the size in bytes to jl_<name>32 / jl_<name>64 (presumably the helpers emitted by the ter_intrinsic_ctype lines above) */ \
+    case 4: \
+        jl_##name##32(32, pa, pb, pc, pr); \
+        break; \
+    case 8: \
+        jl_##name##64(64, pa, pb, pc, pr); \
+        break; \
+    default: \
+        jl_error(#name ": runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64"); \
+    } \
+    return newv; \
+}
+
+// arithmetic: each #define is the C expression handed as OP to the instantiating macro on the following line(s)
+#define neg(a) -a
+#define neg_float(pr, a) *pr = -a
+un_iintrinsic_fast(LLVMNeg, neg, neg_int, u)
+#define add(a,b) a + b
+bi_iintrinsic_fast(LLVMAdd, add, add_int, u)
+#define sub(a,b) a - b
+bi_iintrinsic_fast(LLVMSub, sub, sub_int, u)
+#define mul(a,b) a * b
+bi_iintrinsic_fast(LLVMMul, mul, mul_int, u)
+#define div(a,b) a / b // NOTE(review): no zero-divisor check is visible here; the compiled path in intrinsics.cpp raises jldiverr_var for udiv_int/urem_int - confirm the runtime path
+bi_iintrinsic_fast(LLVMSDiv, div, sdiv_int,  ) // last argument empty vs. `u`: presumably selects signed vs. unsigned C types - TODO confirm
+bi_iintrinsic_fast(LLVMUDiv, div, udiv_int, u)
+#define rem(a,b) a % b
+bi_iintrinsic_fast(LLVMSRem, rem, srem_int,  )
+bi_iintrinsic_fast(LLVMURem, rem, urem_int, u)
+#define smod(a,b) ((a < 0) == (b < 0)) ? a % b : (b + (a % b)) % b // same signs: plain remainder; otherwise shift the remainder by b first
+bi_iintrinsic_fast(jl_LLVMSMod, smod, smod_int,  )
+#define frem(a, b) \
+    fp_select2(a, b, fmod)
+
+un_fintrinsic(neg_float,neg_float)
+bi_fintrinsic(add,add_float) // the float intrinsics reuse the add/sub/mul/div expression macros defined above
+bi_fintrinsic(sub,sub_float)
+bi_fintrinsic(mul,mul_float)
+bi_fintrinsic(div,div_float)
+bi_fintrinsic(frem,rem_float)
+
+// ternary operators: fused multiply-add and plain multiply-then-add
+#define fma(a, b, c) \
+    sizeof(a) == sizeof(float) ? fmaf(a, b, c) : fma(a, b, c) /* the inner fma is not re-expanded (a macro never expands recursively), so it names whatever fma is otherwise declared */
+#define muladd(a, b, c) a * b + c
+ter_fintrinsic(fma,fma_float)
+ter_fintrinsic(muladd,muladd_float)
+
+// same-type comparisons: each #define is the C expression handed as OP to the instantiating macro below it
+#define eq(a,b) a == b
+bool_iintrinsic_fast(LLVMICmpEQ, eq, eq_int, u)
+#define ne(a,b) a != b
+bool_iintrinsic_fast(LLVMICmpNE, ne, ne_int, u)
+#define lt(a,b) a < b
+bool_iintrinsic_fast(LLVMICmpSLT, lt, slt_int,  ) // last argument empty vs. `u`: presumably selects signed vs. unsigned C types - TODO confirm
+bool_iintrinsic_fast(LLVMICmpULT, lt, ult_int, u)
+#define le(a,b) a <= b
+bool_iintrinsic_fast(LLVMICmpSLE, le, sle_int,  )
+bool_iintrinsic_fast(LLVMICmpULE, le, ule_int, u)
+
+typedef union { // views the bits of a float as a signed or unsigned 32-bit integer
+    float f;
+    int32_t d; // signed view
+    uint32_t ud; // unsigned view
+} bits32;
+typedef union { // views the bits of a double as a signed or unsigned 64-bit integer
+    double f;
+    int64_t d; // signed view
+    uint64_t ud; // unsigned view
+} bits64;
+
+#define fpiseq_n(c_type, nbits) /* defines fpiseq<nbits>(a, b): 1 if both are NaN or their bit patterns are identical, else 0 */ \
+static inline int fpiseq##nbits(c_type a, c_type b) { \
+    bits##nbits ua, ub; \
+    ua.f = a; \
+    ub.f = b; \
+    return (isnan(a) && isnan(b)) || ua.d == ub.d; /* bitwise comparison, so +0.0 and -0.0 are not equal here */ \
+}
+fpiseq_n(float, 32)
+fpiseq_n(double, 64)
+#define fpiseq(a,b) /* picks the 32- or 64-bit helper from the operand's size */ \
+    sizeof(a) == sizeof(float) ? fpiseq32(a, b) : fpiseq64(a, b)
+
+#define fpislt_n(c_type, nbits) /* defines fpislt<nbits>(a, b): total-order "less than" in which NaN sorts after every non-NaN value */ \
+static inline int fpislt##nbits(c_type a, c_type b) { \
+    bits##nbits ua, ub; \
+    ua.f = a; \
+    ub.f = b; \
+    if (!isnan(a) && isnan(b)) /* any non-NaN is less than NaN */ \
+        return 1; \
+    if (isnan(a) || isnan(b)) /* NaN is never less than anything, including another NaN */ \
+        return 0; \
+    if (ua.d >= 0 && ua.d < ub.d) /* sign bit of a clear: signed comparison of the bit patterns */ \
+        return 1; \
+    if (ua.d < 0 && ua.ud > ub.ud) /* sign bit of a set: unsigned comparison, reversed */ \
+        return 1; \
+    return 0; \
+}
+fpislt_n(float, 32)
+fpislt_n(double, 64)
+#define fpislt(a, b) /* picks the 32- or 64-bit helper from the operand's size */ \
+    sizeof(a) == sizeof(float) ? fpislt32(a, b) : fpislt64(a, b)
+
+bool_fintrinsic(eq,eq_float) // reuses the eq/ne/lt/le expression macros defined for the integer comparisons
+bool_fintrinsic(ne,ne_float)
+bool_fintrinsic(lt,lt_float)
+bool_fintrinsic(le,le_float)
+bool_fintrinsic(fpiseq,fpiseq) // OP and intrinsic name coincide: jl_fpiseq is built from the fpiseq macro
+bool_fintrinsic(fpislt,fpislt) // likewise for jl_fpislt
+
+// bitwise operators: each #define is the C expression handed as OP to the instantiating macro below it
+#define and(a,b) a & b
+bi_iintrinsic_fast(LLVMAnd, and, and_int, u)
+#define or(a,b) a | b
+bi_iintrinsic_fast(LLVMOr, or, or_int, u)
+#define xor(a,b) a ^ b
+bi_iintrinsic_fast(LLVMXor, xor, xor_int, u)
+#define shl(a,b) b >= 8 * sizeof(a) ? 0 : a << b // shift counts of at least the operand width yield 0 instead of an out-of-range C shift
+bi_iintrinsic_cnvtb_fast(LLVMShl, shl, shl_int, u, 1)
+#define lshr(a,b) (b >= 8 * sizeof(a)) ? 0 : a >> b // same out-of-range guard as shl
+bi_iintrinsic_cnvtb_fast(LLVMLShr, lshr, lshr_int, u, 1)
+#define ashr(a,b) \
+        /* arithmetic shift on an unsigned operand: sign bit set (i.e. (signed)a < 0 in two's complement) ? ~(~a >> b) : (a >> b); ~a is cast back to typeof(a) so that integer promotion of 8/16-bit operands cannot shift in one-bits */ \
+        (a >> (host_char_bit * sizeof(a) - 1)) ? ~(b >= 8 * sizeof(a) ? 0 : ((typeof(a))~a) >> b) : (b >= 8 * sizeof(a) ? 0 : a >> b)
+bi_iintrinsic_cnvtb_fast(LLVMAShr, ashr, ashr_int, u, 1)
+//#define bswap(a) __builtin_bswap(a)
+//un_iintrinsic_fast(LLVMByteSwap, bswap, bswap_int, u)
+un_iintrinsic_slow(LLVMByteSwap, bswap_int, u) // builtin-based variant above is disabled; only the LLVMByteSwap routine is passed
+//#define ctpop(a) __builtin_ctpop(a)
+//uu_iintrinsic_fast(LLVMCountPopulation, ctpop, ctpop_int, u)
+uu_iintrinsic_slow(LLVMCountPopulation, ctpop_int, u)
+//#define ctlz(a) __builtin_ctlz(a)
+//uu_iintrinsic_fast(LLVMCountLeadingZeros, ctlz, ctlz_int, u)
+uu_iintrinsic_slow(LLVMCountLeadingZeros, ctlz_int, u)
+//#define cttz(a) __builtin_cttz(a)
+//uu_iintrinsic_fast(LLVMCountTrailingZeros, cttz, cttz_int, u)
+uu_iintrinsic_slow(LLVMCountTrailingZeros, cttz_int, u)
+#define not(a) ~a
+un_iintrinsic_fast(LLVMFlipAllBits, not, not_int, u)
+
+// conversions: cvt_iintrinsic(LLVMOP, name) presumably defines jl_<name>(ty, a) on top of LLVMOP - see its definition earlier in this file
+cvt_iintrinsic(LLVMTrunc, trunc_int)
+cvt_iintrinsic(LLVMSExt, sext_int)
+cvt_iintrinsic(LLVMZExt, zext_int)
+cvt_iintrinsic(LLVMSItoFP, sitofp)
+cvt_iintrinsic(LLVMUItoFP, uitofp)
+cvt_iintrinsic(LLVMFPtoSI, fptosi) // jl_fptosi(ty, a) is what jl_fptosi_auto calls
+cvt_iintrinsic(LLVMFPtoUI, fptoui) // jl_fptoui(ty, a) is what jl_fptoui_auto calls
+
+#define fpcvt(pr, a) /* stores a as float or double depending on osize, a name that must exist at the expansion site (not visible here) */ \
+        if (osize == 32) /* osize is compared against bit counts */ \
+            *(float*)pr = a; \
+        else if (osize == 64) \
+            *(double*)pr = a; \
+        else \
+            jl_error("fptrunc/fpext: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
+un_fintrinsic_withtype(fpcvt,fptrunc) // fptrunc and fpext share the same body: a C conversion to the requested width
+un_fintrinsic_withtype(fpcvt,fpext)
+
+DLLEXPORT jl_value_t *jl_fptoui_auto(jl_value_t *a) // fptoui with the result type chosen from the size of a
+{
+    jl_datatype_t *ty;
+    switch (jl_datatype_size(jl_typeof(a))) {
+        case 4: // 4-byte input converts to jl_uint32_type
+            ty = jl_uint32_type;
+            break;
+        case 8: // 8-byte input converts to jl_uint64_type
+            ty = jl_uint64_type;
+            break;
+        default:
+            jl_error("fptoui: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
+    }
+    return jl_fptoui((jl_value_t*)ty, a); // delegates to the explicit-type intrinsic
+}
+DLLEXPORT jl_value_t *jl_fptosi_auto(jl_value_t *a) // fptosi with the result type chosen from the size of a
+{
+    jl_datatype_t *ty;
+    switch (jl_datatype_size(jl_typeof(a))) {
+        case 4: // 4-byte input converts to jl_int32_type
+            ty = jl_int32_type;
+            break;
+        case 8: // 8-byte input converts to jl_int64_type
+            ty = jl_int64_type;
+            break;
+        default:
+            jl_error("fptosi: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
+    }
+    return jl_fptosi((jl_value_t*)ty, a); // delegates to the explicit-type intrinsic
+}
+
+// checked conversion
+static inline int all_eq(char *p, unsigned n, char v) // n is unsigned (callers pass an unsigned byte count) so counts above CHAR_MAX are not truncated
+{
+    // returns 1 iff each of the n bytes p[0..n-1] equals v (trivially 1 when n == 0)
+    while (n--)
+        if (*p++ != v)
+            return 0;
+    return 1;
+}
+static unsigned check_trunc_sint(unsigned isize, unsigned osize, void *pa) // nonzero => the isize-byte value at pa does not fit in osize bytes as a signed integer
+{
+    return !all_eq((char*)pa + osize, isize - osize, signbitbyte(pa, osize)); // the discarded high bytes must be the sign extension of the truncated result; TODO: assumes little-endian
+}
+cvt_iintrinsic_checked(LLVMTrunc, check_trunc_sint, checked_trunc_sint)
+static unsigned check_trunc_uint(unsigned isize, unsigned osize, void *pa) // nonzero => some byte beyond the first osize bytes is nonzero
+{
+    return !all_eq((char*)pa + osize, isize - osize, 0); // TODO: assumes little-endian
+}
+cvt_iintrinsic_checked(LLVMTrunc, check_trunc_uint, checked_trunc_uint)
+
+#define checked_fptosi(pr, a) /* throws jl_inexact_exception when LLVMFPtoSI_exact returns 0; pa and osize must exist at the expansion site (not visible here) */ \
+        if (!LLVMFPtoSI_exact(sizeof(a) * host_char_bit, pa, osize, pr)) \
+            jl_throw(jl_inexact_exception);
+un_fintrinsic_withtype(checked_fptosi, checked_fptosi)
+#define checked_fptoui(pr, a) /* throws jl_inexact_exception when LLVMFPtoUI_exact returns 0; pa and osize must exist at the expansion site (not visible here) */ \
+        if (!LLVMFPtoUI_exact(sizeof(a) * host_char_bit, pa, osize, pr)) \
+            jl_throw(jl_inexact_exception);
+un_fintrinsic_withtype(checked_fptoui, checked_fptoui)
+
+DLLEXPORT jl_value_t *jl_check_top_bit(jl_value_t *a) // returns a unchanged, or throws when signbitbyte reports a nonzero sign byte
+{
+    jl_value_t *ty = jl_typeof(a);
+    if (!jl_is_bitstype(ty))
+        jl_error("check_top_bit: value is not a bitstype");
+    if (signbitbyte(jl_data_ptr(a), jl_datatype_size(ty))) // size is passed in bytes
+        jl_throw(jl_inexact_exception);
+    return a;
+}
+
+// checked arithmetic
+#define check_sadd(a,b) \
+        /* nonzero ==> a + b overflows: (b >= 0) ? (a > typemax - b) : (a < typemin - b); typemin/typemax are cast back to typeof(a) so 8/16-bit operands are not compared against a promoted int, and neither subtraction can itself overflow */ \
+        ((b) >= 0 ? (a) > ((typeof(a))~(((typeof(a))1) << (8 * sizeof(a) - 1))) - (b) : (a) < ((typeof(a))(((typeof(a))1) << (8 * sizeof(a) - 1))) - (b))
+checked_iintrinsic_fast(LLVMAdd_sov, check_sadd, add, checked_sadd,  )
+#define check_uadd(a,b) \
+        /* nonzero ==> (a + b) > typemax(a), i.e. a > typemax - b; typemax - b equals ~b in the operand type (the cast undoes integer promotion), and b == 0 never overflows */ \
+        ((a) > (typeof(a))~(b))
+checked_iintrinsic_fast(LLVMAdd_uov, check_uadd, add, checked_uadd, u)
+#define check_ssub(a,b) ((b) >= 0 ? (a) < ((typeof(a))(((typeof(a))1) << (8 * sizeof(a) - 1))) + (b) : (a) > ((typeof(a))~(((typeof(a))1) << (8 * sizeof(a) - 1))) + (b)) /* nonzero ==> a - b overflows; written without negating b, which would itself overflow for b == typemin */
+checked_iintrinsic_fast(LLVMSub_sov, check_ssub, sub, checked_ssub,  )
+#define check_usub(a,b) \
+        /* nonzero ==> overflow: an unsigned a - b would go below zero exactly when a < b */ \
+        a < b
+checked_iintrinsic_fast(LLVMSub_uov, check_usub, sub, checked_usub, u)
+checked_iintrinsic_slow(LLVMMul_sov, checked_smul,  ) // no C-level check expression is given for multiplication; only the LLVMMul_sov routine is passed
+checked_iintrinsic_slow(LLVMMul_uov, checked_umul, u)
+
+DLLEXPORT jl_value_t *jl_nan_dom_err(jl_value_t *a, jl_value_t *b) // returns a, or throws jl_domain_exception when a is NaN but b is not
+{
+    jl_value_t *ty = jl_typeof(a);
+    if (jl_typeof(b) != ty)
+        jl_error("nan_dom_err: types of a and b must match");
+    if (!jl_is_bitstype(ty))
+        jl_error("nan_dom_err: values are not bitstypes");
+    switch (jl_datatype_size(ty)) {
+        case 4: // 4-byte values are read as float
+            if (isnan(*(float*)a) && !isnan(*(float*)b)) // NOTE(review): reads through a/b directly, where sibling intrinsics use jl_data_ptr() - confirm these are equivalent
+                jl_throw(jl_domain_exception);
+            break;
+        case 8: // 8-byte values are read as double
+            if (isnan(*(double*)a) && !isnan(*(double*)b))
+                jl_throw(jl_domain_exception);
+            break;
+        default:
+            jl_error("nan_dom_err: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
+    }
+    return a;
+}
+
+// functions: sign manipulation, rounding and square root
+#define flipsign(a, b) \
+        (b >= 0) ? a : -a
+bi_iintrinsic_fast(jl_LLVMFlipSign, flipsign, flipsign_int,  )
+#define abs_float(pr, a) *pr = fp_select(a, fabs) // fp_select is defined earlier in this file; presumably picks the float or double variant of the named libm function
+#define ceil_float(pr, a) *pr = fp_select(a, ceil)
+#define floor_float(pr, a) *pr = fp_select(a, floor)
+#define trunc_float(pr, a) *pr = fp_select(a, trunc)
+#define rint_float(pr, a) *pr = fp_select(a, rint)
+#define sqrt_float(pr, a) /* throws jl_domain_exception for a < 0; a NaN input fails that test and is passed on to sqrt */ \
+        if (a < 0) \
+            jl_throw(jl_domain_exception); \
+        *pr = fp_select(a, sqrt)
+#define copysign_float(a, b) \
+        fp_select2(a, b, copysign)
+
+un_fintrinsic(abs_float,abs_float)
+bi_fintrinsic(copysign_float,copysign_float)
+un_fintrinsic(ceil_float,ceil_llvm)
+un_fintrinsic(floor_float,floor_llvm)
+un_fintrinsic(trunc_float,trunc_llvm)
+un_fintrinsic(rint_float,rint_llvm)
+un_fintrinsic(sqrt_float,sqrt_llvm)
+
+DLLEXPORT jl_value_t *jl_powi_llvm(jl_value_t *a, jl_value_t *b) // raises the 4- or 8-byte float a to the power b (unboxed as int32); result has a's type
+{
+    jl_value_t *ty = jl_typeof(a);
+    if (!jl_is_bitstype(ty))
+        jl_error("powi_llvm: a is not a bitstype");
+    if (!jl_is_bitstype(jl_typeof(b)) || jl_datatype_size(jl_typeof(b)) != 4) // the exponent must be a 4-byte bitstype
+        jl_error("powi_llvm: b is not a 32-bit bitstype");
+    jl_value_t *newv = newstruct((jl_datatype_t*)ty);
+    void *pa = jl_data_ptr(a), *pr = jl_data_ptr(newv);
+    int sz = jl_datatype_size(ty);
+    switch (sz) {
+    /* dispatch on the size of a in bytes; the exponent is converted to the same floating-point type before calling powf/pow */
+    case 4:
+        *(float*)pr = powf(*(float*)pa, (float)jl_unbox_int32(b));
+        break;
+    case 8:
+        *(double*)pr = pow(*(double*)pa, (double)jl_unbox_int32(b));
+        break;
+    default:
+        jl_error("powi_llvm: runtime floating point intrinsics are not implemented for bit sizes other than 32 and 64");
+    }
+    return newv;
+}
+
+DLLEXPORT jl_value_t *jl_select_value(jl_value_t *isfalse, jl_value_t *a, jl_value_t *b) // returns b when the first argument is jl_false, otherwise a
+{
+    JL_TYPECHK(isfalse, bool, isfalse); // the condition must be a Bool
+    return (isfalse == jl_false ? b : a); // despite its name, `isfalse` is the condition itself: a non-false value selects a
+}

From 971747232bbc5f5846486e8add2e3bcea6dcb988 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Mon, 10 Aug 2015 00:04:34 -0400
Subject: [PATCH 02/11] wip: cleanup intrinsics

---
 src/intrinsics.cpp | 178 ++++++++++++++++++++++-----------------------
 1 file changed, 87 insertions(+), 91 deletions(-)

diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index d8db10bfe7288..e92583d126fe6 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -330,7 +330,7 @@ static Value *emit_unbox(Type *to, const jl_cgval_t &x, jl_value_t *jt)
 }
 
 // unbox, trying to determine correct bitstype automatically
-// returns some sort of raw, unboxed numeric type (in registers)
+// returns some sort of raw, unboxed numeric type (e.g. in registers)
 static Value *auto_unbox(const jl_cgval_t &v, jl_codectx_t *ctx)
 {
     jl_value_t *bt = v.typ;
@@ -831,9 +831,6 @@ static Value *emit_smod(Value *x, Value *den, jl_codectx_t *ctx)
     return ret;
 }
 
-#define HANDLE(intr,n)                                                  \
-    case intr:
-
 static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z, size_t nargs,
                                        jl_codectx_t *ctx, jl_datatype_t* *newtyp);
 static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
@@ -857,20 +854,21 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
     default:
         int ldepth = ctx->gc.argDepth;
         Value *r;
+        Value *func = prepare_call(runtime_func[f]);
         if (nargs == 1) {
             Value *x = emit_boxed_rooted(args[1], ctx).V;
-            r = builder.CreateCall(runtime_func[f], x);
+            r = builder.CreateCall(func, x);
         }
         else if (nargs == 2) {
             Value *x = emit_boxed_rooted(args[1], ctx).V;
             Value *y = emit_boxed_rooted(args[2], ctx).V;
-            r = builder.CreateCall2(runtime_func[f], x, y);
+            r = builder.CreateCall2(func, x, y);
         }
         else if (nargs == 3) {
             Value *x = emit_boxed_rooted(args[1], ctx).V;
             Value *y = emit_boxed_rooted(args[2], ctx).V;
             Value *z = emit_boxed_rooted(args[3], ctx).V;
-            r = builder.CreateCall3(runtime_func[f], x, y, z);
+            r = builder.CreateCall3(func, x, y, z);
         }
         else {
             assert(0);
@@ -886,22 +884,22 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
         return generic_box(args[1], args[2], ctx);
     case unbox:
         return generic_box(args[1], args[2], ctx);
-    HANDLE(trunc_int,2)
+    case trunc_int:
         return generic_trunc(args[1], args[2], ctx, false, false);
-    HANDLE(checked_trunc_sint,2)
+    case checked_trunc_sint:
         return generic_trunc(args[1], args[2], ctx, true, true);
-    HANDLE(checked_trunc_uint,2)
+    case checked_trunc_uint:
         return generic_trunc(args[1], args[2], ctx, true, false);
-    HANDLE(sext_int,2)
+    case sext_int:
         return generic_sext(args[1], args[2], ctx);
-    HANDLE(zext_int,2)
+    case zext_int:
         return generic_zext(args[1], args[2], ctx);
-    HANDLE(checked_fptosi,2)
+    case checked_fptosi:
         return emit_checked_fptosi(args[1], args[2], ctx);
-    HANDLE(checked_fptoui,2)
+    case checked_fptoui:
         return emit_checked_fptoui(args[1], args[2], ctx);
 
-    HANDLE(uitofp,2) {
+    case uitofp: {
         jl_value_t *bt = staticeval_bitstype(args[1], "uitofp", ctx);
         if (!bt) return jl_cgval_t();
         int nb = get_bitstype_nbits(bt);
@@ -910,7 +908,7 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
         return mark_julia_type(builder.CreateUIToFP(xi, FTnbits(nb)), false, bt);
     }
 
-    HANDLE(sitofp,2) {
+    case sitofp: {
         jl_value_t *bt = staticeval_bitstype(args[1], "sitofp", ctx);
         if (!bt) return jl_cgval_t();
         int nb = get_bitstype_nbits(bt);
@@ -952,7 +950,7 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
         return mark_julia_type(builder.CreateFPToSI(xf, Type::getIntNTy(jl_LLVMContext, nb)), false, bt);
     }
 
-    HANDLE(fptrunc,2) {
+    case fptrunc: {
         jl_value_t *bt = staticeval_bitstype(args[1], "sitofp", ctx);
         if (!bt) return jl_cgval_t();
         int nb = get_bitstype_nbits(bt);
@@ -961,7 +959,7 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
         return mark_julia_type(builder.CreateFPTrunc(xf, FTnbits(nb)), false, bt);
     }
 
-    HANDLE(fpext,2) {
+    case fpext: {
         jl_value_t *bt = staticeval_bitstype(args[1], "sitofp", ctx);
         if (!bt) return jl_cgval_t();
         int nb = get_bitstype_nbits(bt);
@@ -980,7 +978,7 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
         return mark_julia_type(builder.CreateFPExt(x, FTnbits(nb)), false, bt);
     }
 
-    HANDLE(select_value,3) {
+    case select_value: {
         Value *isfalse = emit_condition(args[1], "select_value", ctx); // emit the first argument
         jl_value_t *t1 = expr_type(args[2], ctx);
         jl_value_t *t2 = expr_type(args[3], ctx);
@@ -1067,11 +1065,11 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
     Value *den;
     Value *typemin;
     switch (f) {
-    HANDLE(neg_int,1) return builder.CreateSub(ConstantInt::get(t, 0), JL_INT(x));
-    HANDLE(add_int,2) return builder.CreateAdd(JL_INT(x), JL_INT(y));
-    HANDLE(sub_int,2) return builder.CreateSub(JL_INT(x), JL_INT(y));
-    HANDLE(mul_int,2) return builder.CreateMul(JL_INT(x), JL_INT(y));
-    HANDLE(sdiv_int,2)
+    case neg_int: return builder.CreateSub(ConstantInt::get(t, 0), JL_INT(x));
+    case add_int: return builder.CreateAdd(JL_INT(x), JL_INT(y));
+    case sub_int: return builder.CreateSub(JL_INT(x), JL_INT(y));
+    case mul_int: return builder.CreateMul(JL_INT(x), JL_INT(y));
+    case sdiv_int:
         den = JL_INT(y);
         t = den->getType();
         x = JL_INT(x);
@@ -1089,24 +1087,24 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
                                prepare_global(jldiverr_var), ctx);
 
         return builder.CreateSDiv(x, den);
-    HANDLE(udiv_int,2)
+    case udiv_int:
         den = JL_INT(y);
         t = den->getType();
         raise_exception_unless(builder.CreateICmpNE(den, ConstantInt::get(t,0)),
                                prepare_global(jldiverr_var), ctx);
         return builder.CreateUDiv(JL_INT(x), den);
 
-    HANDLE(srem_int,2)
+    case srem_int:
         return emit_srem(JL_INT(x), JL_INT(y), ctx);
 
-    HANDLE(urem_int,2)
+    case urem_int:
         den = JL_INT(y);
         t = den->getType();
         raise_exception_unless(builder.CreateICmpNE(den, ConstantInt::get(t,0)),
                                prepare_global(jldiverr_var), ctx);
         return builder.CreateURem(JL_INT(x), den);
 
-    HANDLE(smod_int,2)
+    case smod_int:
         return emit_smod(JL_INT(x), JL_INT(y), ctx);
 
 // Implements IEEE negate. Unfortunately there is no compliant way
@@ -1114,25 +1112,25 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
 // that do the correct thing on LLVM <= 3.3 and >= 3.5 respectively.
 // See issue #7868
 #ifdef LLVM35
-    HANDLE(neg_float,1) return math_builder(ctx)().CreateFSub(ConstantFP::get(FT(t), -0.0), FP(x));
-    HANDLE(neg_float_fast,1) return math_builder(ctx, true)().CreateFNeg(FP(x));
+    case neg_float: return math_builder(ctx)().CreateFSub(ConstantFP::get(FT(t), -0.0), FP(x));
+    case neg_float_fast: return math_builder(ctx, true)().CreateFNeg(FP(x));
 #else
-    HANDLE(neg_float,1)
+    case neg_float:
         return math_builder(ctx)().CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
-    HANDLE(neg_float_fast,1)
+    case neg_float_fast:
         return math_builder(ctx, true)().CreateFMul(ConstantFP::get(FT(t), -1.0), FP(x));
 #endif
-    HANDLE(add_float,2) return math_builder(ctx)().CreateFAdd(FP(x), FP(y));
-    HANDLE(sub_float,2) return math_builder(ctx)().CreateFSub(FP(x), FP(y));
-    HANDLE(mul_float,2) return math_builder(ctx)().CreateFMul(FP(x), FP(y));
-    HANDLE(div_float,2) return math_builder(ctx)().CreateFDiv(FP(x), FP(y));
-    HANDLE(rem_float,2) return math_builder(ctx)().CreateFRem(FP(x), FP(y));
-    HANDLE(add_float_fast,2) return math_builder(ctx, true)().CreateFAdd(FP(x), FP(y));
-    HANDLE(sub_float_fast,2) return math_builder(ctx, true)().CreateFSub(FP(x), FP(y));
-    HANDLE(mul_float_fast,2) return math_builder(ctx, true)().CreateFMul(FP(x), FP(y));
-    HANDLE(div_float_fast,2) return math_builder(ctx, true)().CreateFDiv(FP(x), FP(y));
-    HANDLE(rem_float_fast,2) return math_builder(ctx, true)().CreateFRem(FP(x), FP(y));
-    HANDLE(fma_float,3) {
+    case add_float: return math_builder(ctx)().CreateFAdd(FP(x), FP(y));
+    case sub_float: return math_builder(ctx)().CreateFSub(FP(x), FP(y));
+    case mul_float: return math_builder(ctx)().CreateFMul(FP(x), FP(y));
+    case div_float: return math_builder(ctx)().CreateFDiv(FP(x), FP(y));
+    case rem_float: return math_builder(ctx)().CreateFRem(FP(x), FP(y));
+    case add_float_fast: return math_builder(ctx, true)().CreateFAdd(FP(x), FP(y));
+    case sub_float_fast: return math_builder(ctx, true)().CreateFSub(FP(x), FP(y));
+    case mul_float_fast: return math_builder(ctx, true)().CreateFMul(FP(x), FP(y));
+    case div_float_fast: return math_builder(ctx, true)().CreateFDiv(FP(x), FP(y));
+    case rem_float_fast: return math_builder(ctx, true)().CreateFRem(FP(x), FP(y));
+    case fma_float: {
       assert(y->getType() == x->getType());
       assert(z->getType() == y->getType());
       Value *fmaintr = Intrinsic::getDeclaration(jl_Module, Intrinsic::fma,
@@ -1143,7 +1141,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
       return builder.CreateCall3(fmaintr, FP(x), FP(y), FP(z));
 #endif
     }
-    HANDLE(muladd_float,3)
+    case muladd_float:
 #ifdef LLVM34
     {
       assert(y->getType() == x->getType());
@@ -1167,12 +1165,12 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
         CreateFAdd(builder.CreateFMul(FP(x), FP(y)), FP(z));
 #endif
 
-    HANDLE(checked_sadd,2)
-    HANDLE(checked_uadd,2)
-    HANDLE(checked_ssub,2)
-    HANDLE(checked_usub,2)
-    HANDLE(checked_smul,2)
-    HANDLE(checked_umul,2) {
+    case checked_sadd:
+    case checked_uadd:
+    case checked_ssub:
+    case checked_usub:
+    case checked_smul:
+    case checked_umul: {
         Value *ix = JL_INT(x); Value *iy = JL_INT(y);
         assert(ix->getType() == iy->getType());
         Value *intr =
@@ -1199,7 +1197,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
         return builder.CreateExtractValue(res, ArrayRef<unsigned>(0));
     }
 
-    HANDLE(check_top_bit,1)
+    case check_top_bit:
         // raise InexactError if argument's top bit is set
         x = JL_INT(x);
         raise_exception_if(builder.
@@ -1209,24 +1207,24 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
                            prepare_global(jlinexacterr_var), ctx);
         return x;
 
-    HANDLE(eq_int,2)  *newtyp = jl_bool_type; return builder.CreateICmpEQ(JL_INT(x), JL_INT(y));
-    HANDLE(ne_int,2)  *newtyp = jl_bool_type; return builder.CreateICmpNE(JL_INT(x), JL_INT(y));
-    HANDLE(slt_int,2) *newtyp = jl_bool_type; return builder.CreateICmpSLT(JL_INT(x), JL_INT(y));
-    HANDLE(ult_int,2) *newtyp = jl_bool_type; return builder.CreateICmpULT(JL_INT(x), JL_INT(y));
-    HANDLE(sle_int,2) *newtyp = jl_bool_type; return builder.CreateICmpSLE(JL_INT(x), JL_INT(y));
-    HANDLE(ule_int,2) *newtyp = jl_bool_type; return builder.CreateICmpULE(JL_INT(x), JL_INT(y));
+    case eq_int:  *newtyp = jl_bool_type; return builder.CreateICmpEQ(JL_INT(x), JL_INT(y));
+    case ne_int:  *newtyp = jl_bool_type; return builder.CreateICmpNE(JL_INT(x), JL_INT(y));
+    case slt_int: *newtyp = jl_bool_type; return builder.CreateICmpSLT(JL_INT(x), JL_INT(y));
+    case ult_int: *newtyp = jl_bool_type; return builder.CreateICmpULT(JL_INT(x), JL_INT(y));
+    case sle_int: *newtyp = jl_bool_type; return builder.CreateICmpSLE(JL_INT(x), JL_INT(y));
+    case ule_int: *newtyp = jl_bool_type; return builder.CreateICmpULE(JL_INT(x), JL_INT(y));
 
-    HANDLE(eq_float,2) *newtyp = jl_bool_type; return math_builder(ctx)().CreateFCmpOEQ(FP(x), FP(y));
-    HANDLE(ne_float,2) *newtyp = jl_bool_type; return math_builder(ctx)().CreateFCmpUNE(FP(x), FP(y));
-    HANDLE(lt_float,2) *newtyp = jl_bool_type; return math_builder(ctx)().CreateFCmpOLT(FP(x), FP(y));
-    HANDLE(le_float,2) *newtyp = jl_bool_type; return math_builder(ctx)().CreateFCmpOLE(FP(x), FP(y));
+    case eq_float: *newtyp = jl_bool_type; return math_builder(ctx)().CreateFCmpOEQ(FP(x), FP(y));
+    case ne_float: *newtyp = jl_bool_type; return math_builder(ctx)().CreateFCmpUNE(FP(x), FP(y));
+    case lt_float: *newtyp = jl_bool_type; return math_builder(ctx)().CreateFCmpOLT(FP(x), FP(y));
+    case le_float: *newtyp = jl_bool_type; return math_builder(ctx)().CreateFCmpOLE(FP(x), FP(y));
 
-    HANDLE(eq_float_fast,2) *newtyp = jl_bool_type; return math_builder(ctx, true)().CreateFCmpOEQ(FP(x), FP(y));
-    HANDLE(ne_float_fast,2) *newtyp = jl_bool_type; return math_builder(ctx, true)().CreateFCmpUNE(FP(x), FP(y));
-    HANDLE(lt_float_fast,2) *newtyp = jl_bool_type; return math_builder(ctx, true)().CreateFCmpOLT(FP(x), FP(y));
-    HANDLE(le_float_fast,2) *newtyp = jl_bool_type; return math_builder(ctx, true)().CreateFCmpOLE(FP(x), FP(y));
+    case eq_float_fast: *newtyp = jl_bool_type; return math_builder(ctx, true)().CreateFCmpOEQ(FP(x), FP(y));
+    case ne_float_fast: *newtyp = jl_bool_type; return math_builder(ctx, true)().CreateFCmpUNE(FP(x), FP(y));
+    case lt_float_fast: *newtyp = jl_bool_type; return math_builder(ctx, true)().CreateFCmpOLT(FP(x), FP(y));
+    case le_float_fast: *newtyp = jl_bool_type; return math_builder(ctx, true)().CreateFCmpOLE(FP(x), FP(y));
 
-    HANDLE(fpiseq,2) {
+    case fpiseq: {
         *newtyp = jl_bool_type;
         Value *xi = JL_INT(x);
         Value *yi = JL_INT(y);
@@ -1237,7 +1235,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
                                 builder.CreateICmpEQ(xi, yi));
     }
 
-    HANDLE(fpislt,2) {
+    case fpislt: {
         *newtyp = jl_bool_type;
         Value *xi = JL_INT(x);
         Value *yi = JL_INT(y);
@@ -1264,11 +1262,11 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
         );
     }
 
-    HANDLE(and_int,2) return builder.CreateAnd(JL_INT(x), JL_INT(y));
-    HANDLE(or_int,2)  return builder.CreateOr(JL_INT(x), JL_INT(y));
-    HANDLE(xor_int,2) return builder.CreateXor(JL_INT(x), JL_INT(y));
-    HANDLE(not_int,1) return builder.CreateXor(JL_INT(x), ConstantInt::get(t, -1, true));
-    HANDLE(shl_int,2)
+    case and_int: return builder.CreateAnd(JL_INT(x), JL_INT(y));
+    case or_int:  return builder.CreateOr(JL_INT(x), JL_INT(y));
+    case xor_int: return builder.CreateXor(JL_INT(x), JL_INT(y));
+    case not_int: return builder.CreateXor(JL_INT(x), ConstantInt::get(t, -1, true));
+    case shl_int:
         x = JL_INT(x); y = JL_INT(y);
         return builder.
             CreateSelect(builder.
@@ -1276,7 +1274,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
                                                            x->getType()->getPrimitiveSizeInBits())),
                          ConstantInt::get(x->getType(),0),
                          builder.CreateShl(x, uint_cnvt(t,y)));
-    HANDLE(lshr_int,2)
+    case lshr_int:
         x = JL_INT(x); y = JL_INT(y);
         return builder.
             CreateSelect(builder.
@@ -1284,7 +1282,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
                                                            x->getType()->getPrimitiveSizeInBits())),
                          ConstantInt::get(x->getType(),0),
                          builder.CreateLShr(x, uint_cnvt(t,y)));
-    HANDLE(ashr_int,2)
+    case ashr_int:
         x = JL_INT(x); y = JL_INT(y);
         return builder.
             CreateSelect(builder.
@@ -1293,17 +1291,17 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
                          builder.CreateAShr(x, ConstantInt::get(x->getType(),
                                                                 x->getType()->getPrimitiveSizeInBits()-1)),
                          builder.CreateAShr(x, uint_cnvt(t,y)));
-    HANDLE(bswap_int,1)
+    case bswap_int:
         x = JL_INT(x);
         return builder.CreateCall(
             Intrinsic::getDeclaration(jl_Module, Intrinsic::bswap,
                                       ArrayRef<Type*>(x->getType())), x);
-    HANDLE(ctpop_int,1)
+    case ctpop_int:
         x = JL_INT(x);
         return builder.CreateCall(
             Intrinsic::getDeclaration(jl_Module, Intrinsic::ctpop,
                                       ArrayRef<Type*>(x->getType())), x);
-    HANDLE(ctlz_int,1) {
+    case ctlz_int: {
         x = JL_INT(x);
         Type *types[1] = {x->getType()};
         Value *ctlz = Intrinsic::getDeclaration(jl_Module, Intrinsic::ctlz,
@@ -1314,7 +1312,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
         return builder.CreateCall2(ctlz, x, ConstantInt::get(T_int1,0));
 #endif
     }
-    HANDLE(cttz_int,1) {
+    case cttz_int: {
         x = JL_INT(x);
         Type *types[1] = {x->getType()};
         Value *cttz = Intrinsic::getDeclaration(jl_Module, Intrinsic::cttz, ArrayRef<Type*>(types));
@@ -1325,7 +1323,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
 #endif
     }
 
-    HANDLE(nan_dom_err,2) {
+    case nan_dom_err: {
         // nan_dom_err(f, x) throw DomainError if isnan(f)&&!isnan(x)
         Value *f = FP(x); x = FP(y);
         raise_exception_unless(builder.CreateOr(builder.CreateFCmpORD(f,f),
@@ -1334,7 +1332,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
         return f;
     }
 
-    HANDLE(abs_float,1)
+    case abs_float:
     {
         x = FP(x);
 #ifdef LLVM34
@@ -1350,7 +1348,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
         return builder.CreateBitCast(absbits, x->getType());
 #endif
     }
-    HANDLE(copysign_float,2)
+    case copysign_float:
     {
         x = FP(x);
         fy = FP(y);
@@ -1369,7 +1367,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
                                                                 signbit0)));
         return builder.CreateBitCast(rbits, x->getType());
     }
-    HANDLE(flipsign_int,2)
+    case flipsign_int:
     {
         x = JL_INT(x);
         fy = JL_INT(y);
@@ -1388,31 +1386,31 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
         Value *tmp = builder.CreateAShr(fy, ConstantInt::get(intt,((IntegerType*)intt)->getBitWidth()-1));
         return builder.CreateXor(builder.CreateAdd(x,tmp),tmp);
     }
-    HANDLE(ceil_llvm,1) {
+    case ceil_llvm: {
         x = FP(x);
         return builder.CreateCall(Intrinsic::getDeclaration(jl_Module, Intrinsic::ceil,
                                                             ArrayRef<Type*>(x->getType())),
                                   x);
     }
-    HANDLE(floor_llvm,1) {
+    case floor_llvm: {
         x = FP(x);
         return builder.CreateCall(Intrinsic::getDeclaration(jl_Module, Intrinsic::floor,
                                                             ArrayRef<Type*>(x->getType())),
                                   x);
     }
-    HANDLE(trunc_llvm,1) {
+    case trunc_llvm: {
         x = FP(x);
         return builder.CreateCall(Intrinsic::getDeclaration(jl_Module, Intrinsic::trunc,
                                                             ArrayRef<Type*>(x->getType())),
                                   x);
     }
-    HANDLE(rint_llvm,1) {
+    case rint_llvm: {
         x = FP(x);
         return builder.CreateCall(Intrinsic::getDeclaration(jl_Module, Intrinsic::rint,
                                                             ArrayRef<Type*>(x->getType())),
                                   x);
     }
-    HANDLE(sqrt_llvm,1) {
+    case sqrt_llvm: {
         x = FP(x);
         raise_exception_unless(builder.CreateFCmpUGE(x, ConstantFP::get(x->getType(),0.0)),
                                prepare_global(jldomerr_var), ctx);
@@ -1420,7 +1418,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
                                                             ArrayRef<Type*>(x->getType())),
                                   x);
     }
-    HANDLE(powi_llvm,2) {
+    case powi_llvm: {
         x = FP(x);
         y = JL_INT(y);
         Type *tx = x->getType(); // TODO: LLVM expects this to be i32
@@ -1441,7 +1439,7 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
         return builder.CreateCall2(pow, x, builder.CreateSIToFP(y, tx));
 #endif
     }
-    HANDLE(sqrt_llvm_fast,1) {
+    case sqrt_llvm_fast: {
         x = FP(x);
         return builder.CreateCall(Intrinsic::getDeclaration(jl_Module, Intrinsic::sqrt,
                                                             ArrayRef<Type*>(x->getType())),
@@ -1455,8 +1453,6 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
     return NULL;
 }
 
-#undef HANDLE
-
 static Function *boxfunc_llvm(FunctionType *ft, const std::string &cname,
                               void *addr, Module *m)
 {

From 5f327735e63e99734deb856eb4512f92d77e56a6 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Mon, 5 Oct 2015 18:34:16 -0400
Subject: [PATCH 03/11] introduce a call method for (runtime versions of)
 intrinsics. use it to have "reinterpret" do runtime error checking (fix
 #12832)

---
 src/cgutils.cpp      |  49 +++++++++++++--
 src/codegen.cpp      |  14 +++--
 src/dump.c           |   1 +
 src/intrinsics.cpp   | 139 ++++++++++++++++++++++++++++++++++++++++++-
 src/julia_internal.h |   1 +
 test/numbers.jl      |   6 ++
 6 files changed, 198 insertions(+), 12 deletions(-)

diff --git a/src/cgutils.cpp b/src/cgutils.cpp
index 7f0d2c5046bb3..8966741ae4a65 100644
--- a/src/cgutils.cpp
+++ b/src/cgutils.cpp
@@ -871,24 +871,22 @@ static Value *emit_typeof(const jl_cgval_t &p)
     return literal_pointer_val(aty);
 }
 
-static Value *emit_datatype_types(const jl_cgval_t &dt)
+static Value *emit_datatype_types(Value *dt)
 {
-    assert(dt.isboxed);
     return builder.
         CreateLoad(builder.
                    CreateBitCast(builder.
-                                 CreateGEP(builder.CreateBitCast(dt.V, T_pint8),
+                                 CreateGEP(builder.CreateBitCast(dt, T_pint8),
                                            ConstantInt::get(T_size, offsetof(jl_datatype_t, types))),
                                  T_ppjlvalue));
 }
 
-static Value *emit_datatype_nfields(const jl_cgval_t &dt)
+static Value *emit_datatype_nfields(Value *dt)
 {
-    assert(dt.isboxed);
     Value *nf = builder.
         CreateLoad(builder.
                    CreateBitCast(builder.
-                                 CreateGEP(builder.CreateBitCast(dt.V, T_pint8),
+                                 CreateGEP(builder.CreateBitCast(dt, T_pint8),
                                            ConstantInt::get(T_size, offsetof(jl_datatype_t, nfields))),
                                  T_pint32));
 #ifdef _P64
@@ -897,6 +895,45 @@ static Value *emit_datatype_nfields(const jl_cgval_t &dt)
     return nf;
 }
 
+static Value *emit_datatype_size(Value *dt)
+{
+    // address of the `size` member, found by byte offset from the start of the jl_datatype_t
+    Value *rawdt = builder.CreateBitCast(dt, T_pint8);
+    Value *fieldoff = ConstantInt::get(T_size, offsetof(jl_datatype_t, size));
+    Value *fieldaddr = builder.CreateGEP(rawdt, fieldoff);
+    // the member is loaded through a T_pint32 pointer
+    Value *size = builder.CreateLoad(builder.CreateBitCast(fieldaddr, T_pint32));
+    return size;
+}
+
+static Value *emit_datatype_mutabl(Value *dt)
+{
+    // load the `mutabl` member through a T_pint8 pointer, then truncate it to a 1-bit flag
+    Value *flagoff = ConstantInt::get(T_size, offsetof(jl_datatype_t, mutabl));
+    Value *flagaddr = builder.CreateGEP(builder.CreateBitCast(dt, T_pint8), flagoff);
+    return builder.CreateTrunc(builder.CreateLoad(flagaddr), T_int1);
+}
+
+static Value *emit_datatype_abstract(Value *dt)
+{
+    // load the `abstract` member through a T_pint8 pointer, then truncate it to a 1-bit flag
+    Value *flagoff = ConstantInt::get(T_size, offsetof(jl_datatype_t, abstract));
+    Value *flagaddr = builder.CreateGEP(builder.CreateBitCast(dt, T_pint8), flagoff);
+    return builder.CreateTrunc(builder.CreateLoad(flagaddr), T_int1);
+}
+
+static Value *emit_datatype_isbitstype(Value *dt)
+{
+    Value *immut = builder.CreateXor(emit_datatype_mutabl(dt), ConstantInt::get(T_int1, -1));
+    Value *nofields = builder.CreateICmpEQ(emit_datatype_nfields(dt), ConstantInt::get(T_size, 0));
+    // runtime bitstype test: immutable, no fields, not abstract AND size > 0 (was emitted as !(abstract && size > 0))
+    Value *concrete = builder.CreateXor(emit_datatype_abstract(dt), ConstantInt::get(T_int1, -1));
+    Value *sized = builder.CreateICmpSGT(emit_datatype_size(dt), ConstantInt::get(T_int32, 0));
+    Value *isbitstype = builder.CreateAnd(immut, builder.CreateAnd(nofields, builder.CreateAnd(concrete, sized)));
+    return isbitstype;
+}
+
+
 // --- generating various error checks ---
 
 static void just_emit_error(const std::string &txt, jl_codectx_t *ctx)
diff --git a/src/codegen.cpp b/src/codegen.cpp
index d88be7d4be5ed..220b6573f3b62 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -2647,10 +2647,13 @@ static bool emit_known_call(jl_cgval_t *ret, jl_value_t *ff,
         else if (jl_is_leaf_type(aty)) {
             jl_cgval_t arg1 = emit_expr(args[1], ctx);
             Value *sz;
-            if (aty == (jl_value_t*)jl_datatype_type)
-                sz = emit_datatype_nfields(arg1);
-            else
+            if (aty == (jl_value_t*)jl_datatype_type) {
+                assert(arg1.isboxed);
+                sz = emit_datatype_nfields(arg1.V);
+            }
+            else {
                 sz = ConstantInt::get(T_size, jl_datatype_nfields(aty));
+            }
             *ret = mark_julia_type(sz, false, jl_long_type);
             JL_GC_POP();
             return true;
@@ -2664,8 +2667,9 @@ static bool emit_known_call(jl_cgval_t *ret, jl_value_t *ff,
             rt2 = expr_type(args[2], ctx); // index argument type
             if (rt2 == (jl_value_t*)jl_long_type) {
                 jl_cgval_t ty = emit_expr(args[1], ctx);
-                Value *types_svec = emit_datatype_types(ty);
-                Value *types_len = emit_datatype_nfields(ty);
+                assert(ty.isboxed);
+                Value *types_svec = emit_datatype_types(ty.V);
+                Value *types_len = emit_datatype_nfields(ty.V);
                 Value *idx = emit_unbox(T_size, emit_unboxed(args[2], ctx), (jl_value_t*)jl_long_type);
                 emit_bounds_check(ty, (jl_value_t*)jl_datatype_type, idx, types_len, ctx);
                 Value *fieldtyp = builder.CreateLoad(builder.CreateGEP(builder.CreateBitCast(types_svec, T_ppjlvalue), idx));
diff --git a/src/dump.c b/src/dump.c
index e87b27aac7184..da91aa1f86e95 100644
--- a/src/dump.c
+++ b/src/dump.c
@@ -78,6 +78,7 @@ static jl_fptr_t id_to_fptrs[] = {
   jl_f_instantiate_type, jl_f_kwcall, jl_trampoline,
   jl_f_methodexists, jl_f_applicable, jl_f_invoke,
   jl_apply_generic, jl_unprotect_stack, jl_f_sizeof, jl_f_new_expr,
+  jl_f_intrinsic_call,
   NULL };
 
 // pointers to non-AST-ish objects in a compressed tree
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index e92583d126fe6..2802a51867322 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -52,6 +52,7 @@ namespace JL_I {
 
 using namespace JL_I;
 Function *runtime_func[num_intrinsics];
+void* runtime_fp[num_intrinsics];
 unsigned intrinsic_nargs[num_intrinsics];
 
 #include "ccall.cpp"
@@ -409,6 +410,94 @@ int get_bitstype_nbits(jl_value_t *bt)
 
 // put a bits type tag on some value (despite the name, this doesn't necessarily actually "box" the value however)
 static jl_cgval_t generic_box(jl_value_t *targ, jl_value_t *x, jl_codectx_t *ctx)
+{
+    // Examine the first argument (the target type), resolving it statically when possible //
+    jl_value_t *bt = static_eval(targ, ctx, true, true);
+    if (bt && !jl_is_leaf_type(bt)) {
+        jl_add_linfo_root(ctx->linfo, bt);
+    }
+
+    if (!bt || !jl_is_bitstype(bt)) {
+        // it's easier to throw a good error from C than llvm
+        if (bt) targ = bt;
+        int last_depth = ctx->gc.argDepth;
+        Value *arg1 = emit_boxed_rooted(targ, ctx).V;
+        Value *arg2 = emit_boxed_rooted(x, ctx).V;
+        Value *func = prepare_call(runtime_func[box]);
+#ifdef LLVM37
+        Value *r = builder.CreateCall(func, {arg1, arg2});
+#else
+        Value *r = builder.CreateCall2(func, arg1, arg2);
+#endif
+        ctx->gc.argDepth = last_depth;
+        jl_value_t *et = expr_type(targ, ctx);
+        return mark_julia_type(r, true, jl_is_type_type(et) ? jl_tparam0(et) : (jl_value_t*)jl_any_type);
+    }
+
+    Type *llvmt = staticeval_bitstype(bt);
+    int nb = jl_datatype_size(bt);
+
+    // Examine the second argument (the value whose bits are re-tagged) //
+    jl_cgval_t v = emit_unboxed(x, ctx);
+    bool isboxed;
+    Type *vxt = julia_type_to_llvm(v.typ, &isboxed);
+
+    if (!jl_is_datatype(v.typ)
+        || !jl_is_bitstype(v.typ)
+        || jl_datatype_size(v.typ) != nb) {
+        Value *typ = emit_typeof(v); // the static type is not conclusive: check the value's type at runtime
+        if (!jl_is_bitstype(v.typ)) {
+            if (isboxed) {
+                Value *isbits = emit_datatype_isbitstype(typ);
+                error_unless(isbits, "reinterpret: expected bitstype value for second argument", ctx);
+            }
+            else {
+                emit_error("reinterpret: expected bitstype value for second argument", ctx);
+                return jl_cgval_t();
+            }
+        }
+        if (!jl_is_datatype(v.typ) || jl_datatype_size(v.typ) != nb) { // only ask a datatype for its size
+            if (isboxed) {
+                Value *size = emit_datatype_size(typ);
+                error_unless(builder.CreateICmpEQ(size, ConstantInt::get(T_int32, nb)),
+                            "reinterpret: argument size does not match size of target type", ctx);
+            }
+            else {
+                emit_error("reinterpret: argument size does not match size of target type", ctx);
+                return jl_cgval_t();
+            }
+        }
+    }
+
+    Value *vx = v.V;
+    if (v.ispointer) {
+        // vx is treated as the address of the value here: cast the pointer if needed, then load
+        if (isboxed) // try to load as original Type, to preserve llvm optimizations
+            vxt = llvmt; // but if the v.typ is not well known, use T
+        if (vx->getType()->getPointerElementType() != vxt)
+            vx = builder.CreatePointerCast(vx, vxt->getPointerTo());
+        vx = builder.CreateLoad(vx);
+    }
+
+    vxt = vx->getType();
+    if (vxt != llvmt) {
+        if (llvmt == T_int1)
+            vx = builder.CreateTrunc(vx, llvmt);
+        else if (vxt == T_int1 && llvmt == T_int8)
+            vx = builder.CreateZExt(vx, llvmt);
+        else if (vxt->isPointerTy() && !llvmt->isPointerTy())
+            vx = builder.CreatePtrToInt(vx, llvmt);
+        else if (!vxt->isPointerTy() && llvmt->isPointerTy())
+            vx = builder.CreateIntToPtr(vx, llvmt);
+        else
+            vx = builder.CreateBitCast(vx, llvmt);
+    }
+
+    return mark_julia_type(vx, false, bt); // result is tagged with the target type and marked unboxed
+}
+
+// put a bits type tag on some value
+static jl_cgval_t generic_unbox(jl_value_t *targ, jl_value_t *x, jl_codectx_t *ctx)
 {
     // Examine the first argument //
     jl_value_t *bt = staticeval_bitstype(targ, NULL, ctx);
@@ -857,18 +946,30 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
         Value *func = prepare_call(runtime_func[f]);
         if (nargs == 1) {
             Value *x = emit_boxed_rooted(args[1], ctx).V;
+#ifdef LLVM37
+            r = builder.CreateCall(func, {x});
+#else
             r = builder.CreateCall(func, x);
+#endif
         }
         else if (nargs == 2) {
             Value *x = emit_boxed_rooted(args[1], ctx).V;
             Value *y = emit_boxed_rooted(args[2], ctx).V;
+#ifdef LLVM37
+            r = builder.CreateCall(func, {x, y});
+#else
             r = builder.CreateCall2(func, x, y);
+#endif
         }
         else if (nargs == 3) {
             Value *x = emit_boxed_rooted(args[1], ctx).V;
             Value *y = emit_boxed_rooted(args[2], ctx).V;
             Value *z = emit_boxed_rooted(args[3], ctx).V;
+#ifdef LLVM37
+            r = builder.CreateCall(func, {x, y, z});
+#else
             r = builder.CreateCall3(func, x, y, z);
+#endif
         }
         else {
             assert(0);
@@ -883,7 +984,7 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
     case box:
         return generic_box(args[1], args[2], ctx);
     case unbox:
-        return generic_box(args[1], args[2], ctx);
+        return generic_unbox(args[1], args[2], ctx); // TODO: replace with generic_box
     case trunc_int:
         return generic_trunc(args[1], args[2], ctx, false, false);
     case checked_trunc_sint:
@@ -1453,6 +1554,36 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
     return NULL;
 }
 
+typedef jl_value_t *(*intrinsic_call_1_arg)(jl_value_t*);
+typedef jl_value_t *(*intrinsic_call_2_arg)(jl_value_t*, jl_value_t*);
+typedef jl_value_t *(*intrinsic_call_3_arg)(jl_value_t*, jl_value_t*, jl_value_t*);
+#define jl_is_intrinsic(v)       jl_typeis(v,jl_intrinsic_type)
+
+JL_CALLABLE(jl_f_intrinsic_call)
+{
+    JL_NARGSV(intrinsic_call, 1);
+    JL_TYPECHK(intrinsic_call, intrinsic, args[0]);
+    intrinsic f = (intrinsic)*(uint32_t*)jl_data_ptr(args[0]);
+    if (f == fptoui && nargs == 1)
+        f = fptoui_auto;
+    if (f == fptosi && nargs == 1)
+        f = fptosi_auto;
+    unsigned fargs = intrinsic_nargs[f];
+    JL_NARGS(intrinsic_call, 1 + fargs, 1 + fargs);
+    switch (fargs) {
+        case 1:
+            return ((intrinsic_call_1_arg)runtime_fp[f])(args[1]);
+        case 2:
+            return ((intrinsic_call_2_arg)runtime_fp[f])(args[1], args[2]);
+        case 3:
+            return ((intrinsic_call_3_arg)runtime_fp[f])(args[1], args[2], args[3]);
+        default:
+            assert(0 && "unexpected number of arguments to an intrinsic function");
+    }
+    abort();
+}
+
+
 static Function *boxfunc_llvm(FunctionType *ft, const std::string &cname,
                               void *addr, Module *m)
 {
@@ -1540,6 +1671,9 @@ extern "C" void jl_init_intrinsic_functions(void)
     //ADD_I(fptosi_auto); ADD_I(fptoui_auto); // these intrinsics are "hidden" in fpto*i
     ADD_I(ccall); ADD_I(cglobal);
     ADD_I(llvmcall);
+
+    jl_set_const(inm, jl_symbol("intrinsic_call"),
+            (jl_value_t*)jl_new_closure(jl_f_intrinsic_call, (jl_value_t*)jl_symbol("intrinsic_call"), NULL));
 }
 #undef ADD_I
 
@@ -1550,12 +1684,14 @@ static void add_intrinsic_to_codegen(Module *m, const std::string &name, intrins
     runtime_func[f] = func;
     add_named_global(func, pfunc);
     intrinsic_nargs[f] = nargs;
+    runtime_fp[f] = pfunc;
 }
 
 static void add_intrinsic_to_codegen(intrinsic alias, intrinsic base)
 {
     runtime_func[alias] = runtime_func[base];
     intrinsic_nargs[alias] = intrinsic_nargs[base];
+    runtime_fp[alias] = runtime_fp[base];
 }
 
 #define ADD_I(name, nargs) add_intrinsic_to_codegen(m, "jl_" #name, name, nargs, args##nargs, (void*)&jl_##name)
@@ -1613,6 +1749,7 @@ static void jl_init_intrinsic_functions_codegen(Module *m)
     ADD_I(check_top_bit, 1);
     ADD_I(nan_dom_err, 2);
     ADD_I(fptosi_auto, 1); ADD_I(fptoui_auto, 1);
+
 }
 
 #undef ADD_I
diff --git a/src/julia_internal.h b/src/julia_internal.h
index d3798c1964f96..86490022e2340 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -75,6 +75,7 @@ JL_CALLABLE(jl_apply_generic);
 JL_CALLABLE(jl_unprotect_stack);
 JL_CALLABLE(jl_f_no_function);
 JL_CALLABLE(jl_f_tuple);
+JL_CALLABLE(jl_f_intrinsic_call);
 extern jl_function_t *jl_unprotect_stack_func;
 extern jl_function_t *jl_bottom_func;
 void jl_install_default_signal_handlers(void);
diff --git a/test/numbers.jl b/test/numbers.jl
index 76b10e3f0fe32..aa78abd831b93 100644
--- a/test/numbers.jl
+++ b/test/numbers.jl
@@ -2353,6 +2353,12 @@ end
 # issue #7508
 @test_throws ErrorException reinterpret(Int, 0x01)
 
+# issue #12832
+@test_throws ErrorException reinterpret(Float64, Complex{Int64}(1))
+@test_throws ErrorException reinterpret(Float64, Complex64(1))
+@test_throws ErrorException reinterpret(Complex64, Float64(1))
+@test_throws ErrorException reinterpret(Int32, false)
+
 # issue #41
 ndigf(n) = Float64(log(Float32(n)))
 @test Float64(log(Float32(256))) == ndigf(256) == 5.545177459716797

From ac66b65212c273cbbf2934b219b25eafb384e2d6 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Tue, 6 Oct 2015 16:09:47 -0400
Subject: [PATCH 04/11] create a compile option for a Julia that is not linked
 against llvm (other than libLLVMSupport.a for APInt support)

---
 Make.inc                 |   1 +
 src/Makefile             |  39 +++---
 src/alloc.c              |   9 ++
 src/anticodegen.c        |  50 ++++++++
 src/ccall.cpp            | 128 +------------------
 src/codegen.cpp          |  20 ---
 src/intrinsics.cpp       | 264 ++++++---------------------------------
 src/intrinsics.h         | 207 ++++++++++++++++++++++++++++++
 src/julia_internal.h     |  10 +-
 src/runtime_ccall.cpp    | 140 +++++++++++++++++++++
 src/runtime_intrinsics.c |   2 +-
 src/sys.c                |  14 +--
 12 files changed, 485 insertions(+), 399 deletions(-)
 create mode 100644 src/anticodegen.c
 create mode 100644 src/intrinsics.h
 create mode 100644 src/runtime_ccall.cpp

diff --git a/Make.inc b/Make.inc
index 9e234c2ac1be1..4c1804c8fc707 100644
--- a/Make.inc
+++ b/Make.inc
@@ -266,6 +266,7 @@ EXE :=
 endif
 
 JULIAGC := MARKSWEEP
+JULIACODEGEN := LLVM
 USE_COPY_STACKS := 1
 
 # flag for disabling assertions
diff --git a/src/Makefile b/src/Makefile
index 2731b897445e2..5f3cab29a04e2 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -11,16 +11,6 @@ override CFLAGS += $(JCFLAGS)
 override CXXFLAGS += $(JCXXFLAGS)
 override CPPFLAGS += $(JCPPFLAGS)
 
-SRCS := \
-	jltypes gf ast builtins module codegen disasm debuginfo interpreter \
-	alloc dlload sys init task array dump toplevel jl_uv jlapi signal-handling \
-	llvm-simdloop simplevector APInt-C runtime_intrinsics
-ifeq ($(JULIAGC),MARKSWEEP)
-SRCS += gc
-endif
-
-HEADERS := $(addprefix $(SRCDIR)/,julia.h julia_internal.h options.h) $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(LIBUV_INC)/uv.h
-
 # -I BUILDDIR comes before -I SRCDIR so that the user can override <options.h> on a per-build-directory basis
 #  for gcc/clang, suggested content is:
 #  #include_next <options.h>
@@ -28,17 +18,37 @@ HEADERS := $(addprefix $(SRCDIR)/,julia.h julia_internal.h options.h) $(BUILDDIR
 FLAGS := \
 	-D_GNU_SOURCE -I$(BUILDDIR) -I$(SRCDIR) \
 	-I$(SRCDIR)/flisp -I$(SRCDIR)/support \
-	-I$(shell $(LLVM_CONFIG_HOST) --includedir) \
 	-I$(LIBUV_INC) -I$(build_includedir) -DLIBRARY_EXPORTS \
 	-I$(JULIAHOME)/deps/valgrind
 ifneq ($(USEMSVC), 1)
 FLAGS += -Wall -Wno-strict-aliasing -fno-omit-frame-pointer -fvisibility=hidden -fno-common
 endif
 
+
+SRCS := \
+	jltypes gf ast builtins module interpreter \
+	alloc dlload sys init task array dump toplevel jl_uv jlapi signal-handling \
+	simplevector APInt-C runtime_intrinsics runtime_ccall
+ifeq ($(JULIAGC),MARKSWEEP)
+SRCS += gc
+endif
+
+ifeq ($(JULIACODEGEN),LLVM)
+SRCS += codegen disasm debuginfo llvm-simdloop
+FLAGS += -I$(shell $(LLVM_CONFIG_HOST) --includedir)
+LLVM_LIBS := all
+else
+SRCS += anticodegen
+LLVM_LIBS := support
+endif
+
+HEADERS := $(addprefix $(SRCDIR)/,julia.h julia_internal.h options.h) $(BUILDDIR)/julia_version.h $(wildcard $(SRCDIR)/support/*.h) $(LIBUV_INC)/uv.h
+
 # In LLVM < 3.4, --ldflags includes both options and libraries, so use it both before and after --libs
 # In LLVM >= 3.4, --ldflags has only options, and --system-libs has the libraries.
-LLVMLINK := $(shell $(LLVM_CONFIG_HOST) --ldflags) $(shell $(LLVM_CONFIG_HOST) --libs) $(shell $(LLVM_CONFIG_HOST) --ldflags) $(shell $(LLVM_CONFIG_HOST) --system-libs 2> /dev/null)
-ifeq ($(USE_LLVM_SHLIB),1)
+ifneq ($(USE_LLVM_SHLIB),1)
+LLVMLINK := $(shell $(LLVM_CONFIG_HOST) --ldflags) $(shell $(LLVM_CONFIG_HOST) --libs $(LLVM_LIBS)) $(shell $(LLVM_CONFIG_HOST) --ldflags) $(shell $(LLVM_CONFIG_HOST) --system-libs 2> /dev/null)
+else
 ifeq ($(LLVM_USE_CMAKE),1)
 LLVMLINK := $(shell $(LLVM_CONFIG_HOST) --ldflags) -lLLVM
 else
@@ -102,7 +112,8 @@ $(BUILDDIR)/julia_flisp.boot: $(addprefix $(SRCDIR)/,jlfrontend.scm \
 		$(call cygpath_w,$(SRCDIR)/mk_julia_flisp_boot.scm) $(call cygpath_w,$(dir $<)) $(notdir $<) $(call cygpath_w,$@))
 
 $(BUILDDIR)/ast.o $(BUILDDIR)/ast.dbg.obj: $(BUILDDIR)/julia_flisp.boot.inc $(SRCDIR)/flisp/*.h
-$(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,intrinsics.cpp cgutils.cpp ccall.cpp abi_*.cpp)
+$(BUILDDIR)/codegen.o $(BUILDDIR)/codegen.dbg.obj: $(addprefix $(SRCDIR)/,intrinsics.cpp intrinsics.h cgutils.cpp ccall.cpp abi_*.cpp)
+$(BUILDDIR)/anticodegen.o $(BUILDDIR)/anticodegen.dbg.obj: $(SRCDIR)/intrinsics.h
 $(BUILDDIR)/builtins.o $(BUILDDIR)/builtins.dbg.obj: $(SRCDIR)/table.c
 $(BUILDDIR)/gc.o $(BUILDDIR)/gc.dbg.obj: $(SRCDIR)/gc-debug.c
 $(BUILDDIR)/signal-handling.o $(BUILDDIR)/signal-handling.dbg.obj: $(addprefix $(SRCDIR)/,signals-*.c)
diff --git a/src/alloc.c b/src/alloc.c
index 52be1ed3b2a2b..02b51a8dcf350 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -819,6 +819,15 @@ jl_value_t *jl_box_bool(int8_t x)
     return jl_false;
 }
 
+DLLEXPORT jl_value_t *jl_new_box(jl_value_t *v)
+{
+    jl_value_t *box = (jl_value_t*)jl_gc_alloc_1w(); // fresh allocation (one word, judging by the allocator's name)
+    jl_set_typeof(box, jl_box_any_type); // tag the new object with jl_box_any_type
+    // if (v) jl_gc_wb(box, v); // write barrier not needed: box was just allocated
+    box->fieldptr[0] = v; // store v in the box's first field slot
+    return box;
+}
+
 // Expr constructor for internal use ------------------------------------------
 
 jl_expr_t *jl_exprn(jl_sym_t *head, size_t n)
diff --git a/src/anticodegen.c b/src/anticodegen.c
new file mode 100644
index 0000000000000..dc78e22d85487
--- /dev/null
+++ b/src/anticodegen.c
@@ -0,0 +1,50 @@
+#include "julia.h"
+#include "julia_internal.h"
+
+#include "intrinsics.h"
+
+int globalUnique = 0;
+
+#define UNAVAILABLE { jl_errorf("%s: not available in this build of Julia", __func__); } // braced statement: reports the enclosing function's name via jl_errorf
+
+void jl_dump_bitcode(char *fname, const char *sysimg_data, size_t sysimg_len) UNAVAILABLE
+void jl_dump_objfile(char *fname, int jit_model, const char *sysimg_data, size_t sysimg_len) UNAVAILABLE
+int32_t jl_get_llvm_gv(jl_value_t *p) UNAVAILABLE
+void jl_write_malloc_log(void) UNAVAILABLE
+void jl_write_coverage_data(void) UNAVAILABLE
+void jl_generate_fptr(jl_function_t *f) {
+    jl_lambda_info_t *li = f->linfo;
+    if (li->fptr == &jl_trampoline) UNAVAILABLE // lambda info still points at jl_trampoline: raise the "not available" error
+    f->fptr = li->fptr; // otherwise copy the function pointer stored on the lambda info onto f
+}
+
+DLLEXPORT void jl_clear_malloc_data(void) UNAVAILABLE // exported stubs: each body is the UNAVAILABLE error statement
+DLLEXPORT void jl_extern_c(jl_function_t *f, jl_value_t *rt, jl_value_t *argt, char *name) UNAVAILABLE
+DLLEXPORT void *jl_function_ptr(jl_function_t *f, jl_value_t *rt, jl_value_t *argt) UNAVAILABLE
+DLLEXPORT const jl_value_t *jl_dump_function_asm(void *f, int raw_mc) UNAVAILABLE
+DLLEXPORT const jl_value_t *jl_dump_function_ir(void *f, uint8_t strip_ir_metadata, uint8_t dump_module) UNAVAILABLE
+
+void jl_init_codegen(void) { } // intentionally empty in this file: does nothing
+void jl_compile(jl_function_t *f) { } // intentionally empty in this file: f is ignored
+void jl_fptr_to_llvm(void *fptr, jl_lambda_info_t *lam, int specsig)
+{
+    if (!specsig) // only the specsig == 0 case stores anything; otherwise fptr is ignored
+        lam->fptr = (jl_fptr_t)fptr;
+}
+void jl_getFunctionInfo(char **name, char **filename, size_t *line,
+                        char **inlinedat_file, size_t *inlinedat_line,
+                        size_t pointer, int *fromC, int skipC, int skipInline)
+{
+    *name = NULL; // every output gets a fixed placeholder; pointer, skipC and skipInline are not consulted
+    *line = -1; // stored into a size_t, i.e. the largest size_t value
+    *filename = NULL;
+    *inlinedat_file = NULL;
+    *inlinedat_line = -1; // same size_t conversion as *line
+    *fromC = 0;
+}
+
+jl_value_t *jl_static_eval(jl_value_t *ex, void *ctx_, jl_module_t *mod,
+                           jl_value_t *sp, jl_expr_t *ast, int sparams, int allow_alloc)
+{
+    return NULL; // stub: always returns NULL, whatever the arguments
+}
diff --git a/src/ccall.cpp b/src/ccall.cpp
index 819818f1bb9f9..7c14ce160750d 100644
--- a/src/ccall.cpp
+++ b/src/ccall.cpp
@@ -1,126 +1,6 @@
 // This file is a part of Julia. License is MIT: http://julialang.org/license
 
-// --- the ccall intrinsic ---
-
-// --- library symbol lookup ---
-
-// map from "libX" to full soname "libX.so.ver"
-#if defined(__linux__) || defined(__FreeBSD__)
-static std::map<std::string, std::string> sonameMap;
-static bool got_sonames = false;
-
-extern "C" DLLEXPORT void jl_read_sonames(void)
-{
-    char *line=NULL;
-    size_t sz=0;
-#if defined(__linux__)
-    FILE *ldc = popen("/sbin/ldconfig -p", "r");
-#else
-    FILE *ldc = popen("/sbin/ldconfig -r", "r");
-#endif
-
-    while (!feof(ldc)) {
-        ssize_t n = getline(&line, &sz, ldc);
-        if (n == -1)
-            break;
-        if (n > 2 && isspace((unsigned char)line[0])) {
-#ifdef __linux__
-            int i = 0;
-            while (isspace((unsigned char)line[++i])) ;
-            char *name = &line[i];
-            char *dot = strstr(name, ".so");
-            i = 0;
-#else
-            char *name = strstr(line, ":-l");
-            if (name == NULL) continue;
-            strncpy(name, "lib", 3);
-            char *dot = strchr(name, '.');
-#endif
-
-            if (NULL == dot)
-                continue;
-
-#ifdef __linux__
-            // Detect if this entry is for the current architecture
-            while (!isspace((unsigned char)dot[++i])) ;
-            while (isspace((unsigned char)dot[++i])) ;
-            int j = i;
-            while (!isspace((unsigned char)dot[++j])) ;
-            char *arch = strstr(dot+i,"x86-64");
-            if (arch != NULL && arch < dot + j) {
-#ifdef _P32
-                continue;
-#endif
-            }
-            else {
-#ifdef _P64
-                continue;
-#endif
-            }
-#endif // __linux__
-
-            char *abslibpath = strrchr(line, ' ');
-            if (dot != NULL && abslibpath != NULL) {
-                std::string pfx(name, dot - name);
-                // Do not include ' ' in front and '\n' at the end
-                std::string soname(abslibpath+1, line+n-(abslibpath+1)-1);
-                sonameMap[pfx] = soname;
-            }
-        }
-    }
-
-    free(line);
-    pclose(ldc);
-}
-
-extern "C" DLLEXPORT const char *jl_lookup_soname(const char *pfx, size_t n)
-{
-    if (!got_sonames) {
-        jl_read_sonames();
-        got_sonames = true;
-    }
-    std::string str(pfx, n);
-    if (sonameMap.find(str) != sonameMap.end()) {
-        return sonameMap[str].c_str();
-    }
-    return NULL;
-}
-#endif
-
-// map from user-specified lib names to handles
-static std::map<std::string, uv_lib_t*> libMap;
-
-static uv_lib_t *get_library(char *lib)
-{
-    uv_lib_t *hnd;
-#ifdef _OS_WINDOWS_
-    if ((intptr_t)lib == 1)
-        return jl_exe_handle;
-    if ((intptr_t)lib == 2)
-        return jl_dl_handle;
-#endif
-    if (lib == NULL)
-        return jl_RTLD_DEFAULT_handle;
-    hnd = libMap[lib];
-    if (hnd != NULL)
-        return hnd;
-    hnd = (uv_lib_t *) jl_load_dynamic_library(lib, JL_RTLD_DEFAULT);
-    if (hnd != NULL)
-        libMap[lib] = hnd;
-    return hnd;
-}
-
-extern "C" DLLEXPORT
-void *jl_load_and_lookup(char *f_lib, char *f_name, uv_lib_t **hnd)
-{
-    uv_lib_t *handle = *hnd;
-    if (!handle)
-        *hnd = handle = get_library(f_lib);
-    void *ptr = jl_dlsym_e(handle, f_name);
-    if (!ptr)
-        jl_errorf("symbol \"%s\" could not be found: %s", f_name, uv_dlerror(handle));
-    return ptr;
-}
+// --- the ccall, cglobal, and llvm intrinsics ---
 
 static std::map<std::string, GlobalVariable*> libMapGV;
 static std::map<std::string, GlobalVariable*> symMapGV;
@@ -161,7 +41,7 @@ static Value *runtime_sym_lookup(PointerType *funcptype, char *f_lib, char *f_na
                false, GlobalVariable::PrivateLinkage,
                initnul, f_lib);
             libMapGV[f_lib] = libptrgv;
-            libsym = get_library(f_lib);
+            libsym = jl_get_library(f_lib);
             assert(libsym != NULL);
 #ifdef USE_MCJIT
             jl_llvm_to_jl_value[libptrgv] = libsym;
@@ -565,7 +445,7 @@ static jl_cgval_t emit_cglobal(jl_value_t **args, size_t nargs, jl_codectx_t *ct
             res = runtime_sym_lookup((PointerType*)lrt, sym.f_lib, sym.f_name, ctx);
         }
         else {
-            void *symaddr = jl_dlsym_e(get_library(sym.f_lib), sym.f_name);
+            void *symaddr = jl_dlsym_e(jl_get_library(sym.f_lib), sym.f_name);
             if (symaddr == NULL) {
                 std::stringstream msg;
                 msg << "cglobal: could not find symbol ";
@@ -1370,7 +1250,7 @@ static jl_cgval_t emit_ccall(jl_value_t **args, size_t nargs, jl_codectx_t *ctx)
             llvmf = runtime_sym_lookup(funcptype, f_lib, f_name, ctx);
         }
         else {
-            void *symaddr = jl_dlsym_e(get_library(f_lib), f_name);
+            void *symaddr = jl_dlsym_e(jl_get_library(f_lib), f_name);
             if (symaddr == NULL) {
                 JL_GC_POP();
                 std::stringstream msg;
diff --git a/src/codegen.cpp b/src/codegen.cpp
index 220b6573f3b62..923bc06f99e1c 100644
--- a/src/codegen.cpp
+++ b/src/codegen.cpp
@@ -658,17 +658,6 @@ extern "C" {
     int globalUnique = 0;
 }
 
-extern "C" DLLEXPORT
-jl_value_t *jl_get_cpu_name(void)
-{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 5
-    std::string HostCPUName = llvm::sys::getHostCPUName();
-#else
-    StringRef HostCPUName = llvm::sys::getHostCPUName();
-#endif
-    return jl_pchar_to_string(HostCPUName.data(), HostCPUName.size());
-}
-
 static void emit_write_barrier(jl_codectx_t*, Value*, Value*);
 
 #include "cgutils.cpp"
@@ -5208,15 +5197,6 @@ extern "C" void jl_fptr_to_llvm(void *fptr, jl_lambda_info_t *lam, int specsig)
     }
 }
 
-extern "C" DLLEXPORT jl_value_t *jl_new_box(jl_value_t *v)
-{
-    jl_value_t *box = (jl_value_t*)jl_gc_alloc_1w();
-    jl_set_typeof(box, jl_box_any_type);
-    // if (v) jl_gc_wb(box, v); // write block not needed: box was just allocated
-    box->fieldptr[0] = v;
-    return box;
-}
-
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 3 && SYSTEM_LLVM
 #define INSTCOMBINE_BUG
 #define V128_BUG
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index 2802a51867322..88f7e1734d877 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -1,61 +1,39 @@
 // This file is a part of Julia. License is MIT: http://julialang.org/license
 
 namespace JL_I {
-    enum intrinsic {
-        // wrap and unwrap
-        box=0, unbox,
-        // arithmetic
-        neg_int, add_int, sub_int, mul_int,
-        sdiv_int, udiv_int, srem_int, urem_int, smod_int,
-        neg_float, add_float, sub_float, mul_float, div_float, rem_float,
-        fma_float, muladd_float,
-        // fast arithmetic
-        neg_float_fast, add_float_fast, sub_float_fast,
-        mul_float_fast, div_float_fast, rem_float_fast,
-        // same-type comparisons
-        eq_int,  ne_int,
-        slt_int, ult_int,
-        sle_int, ule_int,
-        eq_float, ne_float,
-        lt_float, le_float,
-        eq_float_fast, ne_float_fast,
-        lt_float_fast, le_float_fast,
-        fpiseq, fpislt,
-        // bitwise operators
-        and_int, or_int, xor_int, not_int, shl_int, lshr_int, ashr_int,
-        bswap_int, ctpop_int, ctlz_int, cttz_int,
-        // conversion
-        sext_int, zext_int, trunc_int,
-        fptoui, fptosi, uitofp, sitofp,
-        fptrunc, fpext,
-        // checked conversion
-        checked_fptosi, checked_fptoui,
-        checked_trunc_sint, checked_trunc_uint, check_top_bit,
-        // checked arithmetic
-        checked_sadd, checked_uadd, checked_ssub, checked_usub,
-        checked_smul, checked_umul,
-        nan_dom_err,
-        // functions
-        abs_float, copysign_float, flipsign_int, select_value,
-        ceil_llvm, floor_llvm, trunc_llvm, rint_llvm,
-        sqrt_llvm, powi_llvm,
-        sqrt_llvm_fast,
-        // pointer access
-        pointerref, pointerset,
-        // c interface
-        ccall, cglobal, llvmcall,
-        // terminator
-        fptoui_auto, fptosi_auto,
-        num_intrinsics
-    };
+#include "intrinsics.h"
 }
 
+#include "ccall.cpp"
+
 using namespace JL_I;
-Function *runtime_func[num_intrinsics];
-void* runtime_fp[num_intrinsics];
-unsigned intrinsic_nargs[num_intrinsics];
+static Function *runtime_func[num_intrinsics];
+static void jl_init_intrinsic_functions_codegen(Module *m)
+{ // declares one external "jl_<name>" Function in m per intrinsic and records it in runtime_func
+    std::vector<Type *> args1(0); // parameter list for 1-argument intrinsics
+    args1.push_back(T_pjlvalue);
+    std::vector<Type *> args2(0); // parameter list for 2-argument intrinsics
+    args2.push_back(T_pjlvalue);
+    args2.push_back(T_pjlvalue);
+    std::vector<Type *> args3(0); // parameter list for 3-argument intrinsics
+    args3.push_back(T_pjlvalue);
+    args3.push_back(T_pjlvalue);
+    args3.push_back(T_pjlvalue);
 
-#include "ccall.cpp"
+#define ADD_I(name, nargs) do { \
+        Function *func = Function::Create(FunctionType::get(T_pjlvalue, args##nargs, false), \
+                                          Function::ExternalLinkage, "jl_"#name, m); \
+        runtime_func[name] = func; \
+        add_named_global(func, (void*)&jl_##name); \
+    } while (0);
+#define ADD_HIDDEN ADD_I // registered exactly like ADD_I here
+#define ALIAS(alias, base) runtime_func[alias] = runtime_func[base];
+    ADD_HIDDEN(reinterpret, 2);
+    INTRINSICS
+#undef ADD_I
+#undef ADD_HIDDEN
+#undef ALIAS
+}
 
 /*
   low-level intrinsics design: TODO: fix description below
@@ -423,7 +401,7 @@ static jl_cgval_t generic_box(jl_value_t *targ, jl_value_t *x, jl_codectx_t *ctx
         int last_depth = ctx->gc.argDepth;
         Value *arg1 = emit_boxed_rooted(targ, ctx).V;
         Value *arg2 = emit_boxed_rooted(x, ctx).V;
-        Value *func = prepare_call(runtime_func[box]);
+        Value *func = prepare_call(runtime_func[reinterpret]);
 #ifdef LLVM37
         Value *r = builder.CreateCall(func, {arg1, arg2});
 #else
@@ -932,7 +910,7 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
         f = fptosi_auto;
     unsigned expected_nargs = intrinsic_nargs[f];
     if (expected_nargs && expected_nargs != nargs) {
-        jl_errorf("intrinsic #%d: wrong number of arguments", f);
+        jl_errorf("intrinsic #%d %s: wrong number of arguments", f, jl_intrinsic_name((int)f));
     }
 
     switch (f) {
@@ -1554,35 +1532,12 @@ static Value *emit_untyped_intrinsic(intrinsic f, Value *x, Value *y, Value *z,
     return NULL;
 }
 
-typedef jl_value_t *(*intrinsic_call_1_arg)(jl_value_t*);
-typedef jl_value_t *(*intrinsic_call_2_arg)(jl_value_t*, jl_value_t*);
-typedef jl_value_t *(*intrinsic_call_3_arg)(jl_value_t*, jl_value_t*, jl_value_t*);
-#define jl_is_intrinsic(v)       jl_typeis(v,jl_intrinsic_type)
-
-JL_CALLABLE(jl_f_intrinsic_call)
-{
-    JL_NARGSV(intrinsic_call, 1);
-    JL_TYPECHK(intrinsic_call, intrinsic, args[0]);
-    intrinsic f = (intrinsic)*(uint32_t*)jl_data_ptr(args[0]);
-    if (f == fptoui && nargs == 1)
-        f = fptoui_auto;
-    if (f == fptosi && nargs == 1)
-        f = fptosi_auto;
-    unsigned fargs = intrinsic_nargs[f];
-    JL_NARGS(intrinsic_call, 1 + fargs, 1 + fargs);
-    switch (fargs) {
-        case 1:
-            return ((intrinsic_call_1_arg)runtime_fp[f])(args[1]);
-        case 2:
-            return ((intrinsic_call_2_arg)runtime_fp[f])(args[1], args[2]);
-        case 3:
-            return ((intrinsic_call_3_arg)runtime_fp[f])(args[1], args[2], args[3]);
-        default:
-            assert(0 && "unexpected number of arguments to an intrinsic function");
-    }
-    abort();
-}
+#define BOX_F(ct,jl_ct) /* assigns box_<ct>_func from boxfunc_llvm(ft1arg(T_pjlvalue, T_<jl_ct>), "jl_box_<ct>", ...) */ \
+    box_##ct##_func = boxfunc_llvm(ft1arg(T_pjlvalue, T_##jl_ct),     \
+                                   "jl_box_"#ct, (void*)&jl_box_##ct, m);
 
+#define SBOX_F(ct,jl_ct) BOX_F(ct,jl_ct); box_##ct##_func->addAttribute(1, Attribute::SExt); // BOX_F, then SExt at attribute index 1; expands to two statements
+#define UBOX_F(ct,jl_ct) BOX_F(ct,jl_ct); box_##ct##_func->addAttribute(1, Attribute::ZExt); // BOX_F, then ZExt at attribute index 1; expands to two statements
 
 static Function *boxfunc_llvm(FunctionType *ft, const std::string &cname,
                               void *addr, Module *m)
@@ -1607,150 +1562,3 @@ static FunctionType *ft2arg(Type *ret, Type *arg1, Type *arg2)
     args2.push_back(arg2);
     return FunctionType::get(ret, args2, false);
 }
-
-#define BOX_F(ct,jl_ct)                                                    \
-    box_##ct##_func = boxfunc_llvm(ft1arg(T_pjlvalue, T_##jl_ct),     \
-                                   "jl_box_"#ct, (void*)&jl_box_##ct, m);
-
-#define SBOX_F(ct,jl_ct) BOX_F(ct,jl_ct); box_##ct##_func->addAttribute(1, Attribute::SExt);
-#define UBOX_F(ct,jl_ct) BOX_F(ct,jl_ct); box_##ct##_func->addAttribute(1, Attribute::ZExt);
-
-static void add_intrinsic(jl_module_t *m, const std::string &name, intrinsic f)
-{
-    jl_value_t *i = jl_box32(jl_intrinsic_type, (int32_t)f);
-    jl_sym_t *sym = jl_symbol(const_cast<char*>(name.c_str()));
-    jl_set_const(m, sym, i);
-    jl_module_export(m, sym);
-}
-
-#define ADD_I(name) add_intrinsic(inm, #name, name)
-
-extern "C" void jl_init_intrinsic_functions(void)
-{
-    jl_module_t *inm = jl_new_module(jl_symbol("Intrinsics"));
-    inm->parent = jl_core_module;
-    jl_set_const(jl_core_module, jl_symbol("Intrinsics"), (jl_value_t*)inm);
-
-    ADD_I(box); ADD_I(unbox);
-    ADD_I(neg_int); ADD_I(add_int); ADD_I(sub_int); ADD_I(mul_int);
-    ADD_I(sdiv_int); ADD_I(udiv_int); ADD_I(srem_int); ADD_I(urem_int);
-    ADD_I(smod_int);
-    ADD_I(neg_float); ADD_I(add_float); ADD_I(sub_float); ADD_I(mul_float);
-    ADD_I(div_float); ADD_I(rem_float); ADD_I(fma_float); ADD_I(muladd_float);
-    ADD_I(neg_float_fast); ADD_I(add_float_fast); ADD_I(sub_float_fast);
-    ADD_I(mul_float_fast); ADD_I(div_float_fast); ADD_I(rem_float_fast);
-    ADD_I(eq_int); ADD_I(ne_int);
-    ADD_I(slt_int); ADD_I(ult_int);
-    ADD_I(sle_int); ADD_I(ule_int);
-    ADD_I(eq_float); ADD_I(ne_float);
-    ADD_I(lt_float); ADD_I(le_float);
-    ADD_I(eq_float_fast); ADD_I(ne_float_fast);
-    ADD_I(lt_float_fast); ADD_I(le_float_fast);
-    ADD_I(fpiseq); ADD_I(fpislt);
-    ADD_I(and_int); ADD_I(or_int); ADD_I(xor_int); ADD_I(not_int);
-    ADD_I(shl_int); ADD_I(lshr_int); ADD_I(ashr_int); ADD_I(bswap_int);
-    ADD_I(ctpop_int); ADD_I(ctlz_int); ADD_I(cttz_int);
-    ADD_I(sext_int); ADD_I(zext_int); ADD_I(trunc_int);
-    ADD_I(fptoui); ADD_I(fptosi);
-    ADD_I(uitofp); ADD_I(sitofp);
-    ADD_I(fptrunc); ADD_I(fpext);
-    ADD_I(abs_float); ADD_I(copysign_float);
-    ADD_I(flipsign_int); ADD_I(select_value);
-    ADD_I(ceil_llvm); ADD_I(floor_llvm); ADD_I(trunc_llvm); ADD_I(rint_llvm);
-    ADD_I(sqrt_llvm); ADD_I(powi_llvm);
-    ADD_I(sqrt_llvm_fast);
-    ADD_I(pointerref); ADD_I(pointerset);
-    ADD_I(checked_sadd); ADD_I(checked_uadd);
-    ADD_I(checked_ssub); ADD_I(checked_usub);
-    ADD_I(checked_smul); ADD_I(checked_umul);
-    ADD_I(checked_fptosi); ADD_I(checked_fptoui);
-    ADD_I(checked_trunc_sint);
-    ADD_I(checked_trunc_uint);
-    ADD_I(check_top_bit);
-    ADD_I(nan_dom_err);
-    //ADD_I(fptosi_auto); ADD_I(fptoui_auto); // these intrinsics are "hidden" in fpto*i
-    ADD_I(ccall); ADD_I(cglobal);
-    ADD_I(llvmcall);
-
-    jl_set_const(inm, jl_symbol("intrinsic_call"),
-            (jl_value_t*)jl_new_closure(jl_f_intrinsic_call, (jl_value_t*)jl_symbol("intrinsic_call"), NULL));
-}
-#undef ADD_I
-
-static void add_intrinsic_to_codegen(Module *m, const std::string &name, intrinsic f,
-        unsigned nargs, std::vector<Type *> args, void *pfunc) {
-    Function *func = Function::Create(FunctionType::get(T_pjlvalue, args, false),
-                                      Function::ExternalLinkage, name, m);
-    runtime_func[f] = func;
-    add_named_global(func, pfunc);
-    intrinsic_nargs[f] = nargs;
-    runtime_fp[f] = pfunc;
-}
-
-static void add_intrinsic_to_codegen(intrinsic alias, intrinsic base)
-{
-    runtime_func[alias] = runtime_func[base];
-    intrinsic_nargs[alias] = intrinsic_nargs[base];
-    runtime_fp[alias] = runtime_fp[base];
-}
-
-#define ADD_I(name, nargs) add_intrinsic_to_codegen(m, "jl_" #name, name, nargs, args##nargs, (void*)&jl_##name)
-#define ALIAS(alias, base) add_intrinsic_to_codegen(alias, base)
-
-static void jl_init_intrinsic_functions_codegen(Module *m)
-{
-    std::vector<Type *> args1(0);
-    args1.push_back(T_pjlvalue);
-    std::vector<Type *> args2(0);
-    args2.push_back(T_pjlvalue);
-    args2.push_back(T_pjlvalue);
-    std::vector<Type *> args3(0);
-    args3.push_back(T_pjlvalue);
-    args3.push_back(T_pjlvalue);
-    args3.push_back(T_pjlvalue);
-
-    add_intrinsic_to_codegen(m, "jl_reinterpret", box,
-        2, args2, (void*)&jl_reinterpret);
-    ALIAS(unbox, box);
-    ADD_I(neg_int, 1); ADD_I(add_int, 2); ADD_I(sub_int, 2); ADD_I(mul_int, 2);
-    ADD_I(sdiv_int, 2); ADD_I(udiv_int, 2); ADD_I(srem_int, 2); ADD_I(urem_int, 2);
-    ADD_I(smod_int, 2);
-    ADD_I(neg_float, 1); ADD_I(add_float, 2); ADD_I(sub_float, 2); ADD_I(mul_float, 2);
-    ADD_I(div_float, 2); ADD_I(rem_float, 2); ADD_I(fma_float, 3); ADD_I(muladd_float, 3);
-    ALIAS(neg_float_fast, neg_float); ALIAS(add_float_fast, add_float); ALIAS(sub_float_fast, sub_float);
-    ALIAS(mul_float_fast, mul_float); ALIAS(div_float_fast, div_float); ALIAS(rem_float_fast, rem_float);
-    ADD_I(eq_int, 2); ADD_I(ne_int, 2);
-    ADD_I(slt_int, 2); ADD_I(ult_int, 2);
-    ADD_I(sle_int, 2); ADD_I(ule_int, 2);
-    ADD_I(eq_float, 2); ADD_I(ne_float, 2);
-    ADD_I(lt_float, 2); ADD_I(le_float, 2);
-    ALIAS(eq_float_fast, eq_float); ALIAS(ne_float_fast, ne_float);
-    ALIAS(lt_float_fast, lt_float); ALIAS(le_float_fast, le_float);
-    ADD_I(fpiseq, 2); ADD_I(fpislt, 2);
-    ADD_I(and_int, 2); ADD_I(or_int, 2); ADD_I(xor_int, 2); ADD_I(not_int, 1);
-    ADD_I(shl_int, 2); ADD_I(lshr_int, 2); ADD_I(ashr_int, 2); ADD_I(bswap_int, 1);
-    ADD_I(ctpop_int, 1); ADD_I(ctlz_int, 1); ADD_I(cttz_int, 1);
-    ADD_I(sext_int, 2); ADD_I(zext_int, 2); ADD_I(trunc_int, 2);
-    ADD_I(fptoui, 2); ADD_I(fptosi, 2);
-    ADD_I(uitofp, 2); ADD_I(sitofp, 2);
-    ADD_I(fptrunc, 2); ADD_I(fpext, 2);
-    ADD_I(abs_float, 1); ADD_I(copysign_float, 2);
-    ADD_I(flipsign_int, 2); ADD_I(select_value, 3);
-    ADD_I(ceil_llvm, 1); ADD_I(floor_llvm, 1); ADD_I(trunc_llvm, 1); ADD_I(rint_llvm, 1);
-    ADD_I(sqrt_llvm, 1); ADD_I(powi_llvm, 2);
-    ALIAS(sqrt_llvm_fast, sqrt_llvm);
-    ADD_I(pointerref, 2); ADD_I(pointerset, 3);
-    ADD_I(checked_sadd, 2); ADD_I(checked_uadd, 2);
-    ADD_I(checked_ssub, 2); ADD_I(checked_usub, 2);
-    ADD_I(checked_smul, 2); ADD_I(checked_umul, 2);
-    ADD_I(checked_fptosi, 2); ADD_I(checked_fptoui, 2);
-    ADD_I(checked_trunc_sint, 2);
-    ADD_I(checked_trunc_uint, 2);
-    ADD_I(check_top_bit, 1);
-    ADD_I(nan_dom_err, 2);
-    ADD_I(fptosi_auto, 1); ADD_I(fptoui_auto, 1);
-
-}
-
-#undef ADD_I
-#undef ALIAS
diff --git a/src/intrinsics.h b/src/intrinsics.h
new file mode 100644
index 0000000000000..3692ec2d260e5
--- /dev/null
+++ b/src/intrinsics.h
@@ -0,0 +1,207 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+
+#define INTRINSICS /* table of intrinsics; each user defines ADD_I / ADD_HIDDEN / ALIAS before expanding it, and entry order sets the enum values */ \
+    /*  wrap and unwrap */ \
+    ALIAS(box, reinterpret) \
+    ALIAS(unbox, reinterpret) \
+    /*  arithmetic */ \
+    ADD_I(neg_int, 1) \
+    ADD_I(add_int, 2) \
+    ADD_I(sub_int, 2) \
+    ADD_I(mul_int, 2) \
+    ADD_I(sdiv_int, 2) \
+    ADD_I(udiv_int, 2) \
+    ADD_I(srem_int, 2) \
+    ADD_I(urem_int, 2) \
+    ADD_I(smod_int, 2) \
+    ADD_I(neg_float, 1) \
+    ADD_I(add_float, 2) \
+    ADD_I(sub_float, 2) \
+    ADD_I(mul_float, 2) \
+    ADD_I(div_float, 2) \
+    ADD_I(rem_float, 2) \
+    ADD_I(fma_float, 3) \
+    ADD_I(muladd_float, 3) \
+    /*  fast arithmetic */ \
+    ALIAS(neg_float_fast, neg_float) \
+    ALIAS(add_float_fast, add_float) \
+    ALIAS(sub_float_fast, sub_float) \
+    ALIAS(mul_float_fast, mul_float) \
+    ALIAS(div_float_fast, div_float) \
+    ALIAS(rem_float_fast, rem_float) \
+    /*  same-type comparisons */ \
+    ADD_I(eq_int, 2) \
+    ADD_I(ne_int, 2) \
+    ADD_I(slt_int, 2) \
+    ADD_I(ult_int, 2) \
+    ADD_I(sle_int, 2) \
+    ADD_I(ule_int, 2) \
+    ADD_I(eq_float, 2) \
+    ADD_I(ne_float, 2) \
+    ADD_I(lt_float, 2) \
+    ADD_I(le_float, 2) \
+    ALIAS(eq_float_fast, eq_float) \
+    ALIAS(ne_float_fast, ne_float) \
+    ALIAS(lt_float_fast, lt_float) \
+    ALIAS(le_float_fast, le_float) \
+    ADD_I(fpiseq, 2) \
+    ADD_I(fpislt, 2) \
+    /*  bitwise operators */ \
+    ADD_I(and_int, 2) \
+    ADD_I(or_int, 2) \
+    ADD_I(xor_int, 2) \
+    ADD_I(not_int, 1) \
+    ADD_I(shl_int, 2) \
+    ADD_I(lshr_int, 2) \
+    ADD_I(ashr_int, 2) \
+    ADD_I(bswap_int, 1) \
+    ADD_I(ctpop_int, 1) \
+    ADD_I(ctlz_int, 1) \
+    ADD_I(cttz_int, 1) \
+    /*  conversion */ \
+    ADD_I(sext_int, 2) \
+    ADD_I(zext_int, 2) \
+    ADD_I(trunc_int, 2) \
+    ADD_I(fptoui, 2) \
+    ADD_I(fptosi, 2) \
+    ADD_I(uitofp, 2) \
+    ADD_I(sitofp, 2) \
+    ADD_I(fptrunc, 2) \
+    ADD_I(fpext, 2) \
+    /*  checked conversion */ \
+    ADD_I(checked_fptosi, 2) \
+    ADD_I(checked_fptoui, 2) \
+    ADD_I(checked_trunc_sint, 2) \
+    ADD_I(checked_trunc_uint, 2) \
+    ADD_I(check_top_bit, 1) \
+    /*  checked arithmetic */ \
+    ADD_I(checked_sadd, 2) \
+    ADD_I(checked_uadd, 2) \
+    ADD_I(checked_ssub, 2) \
+    ADD_I(checked_usub, 2) \
+    ADD_I(checked_smul, 2) \
+    ADD_I(checked_umul, 2) \
+    ADD_I(nan_dom_err, 2) \
+    /*  functions */ \
+    ADD_I(abs_float, 1) \
+    ADD_I(copysign_float, 2) \
+    ADD_I(flipsign_int, 2) \
+    ADD_I(select_value, 3) \
+    ADD_I(ceil_llvm, 1) \
+    ADD_I(floor_llvm, 1) \
+    ADD_I(trunc_llvm, 1) \
+    ADD_I(rint_llvm, 1) \
+    ADD_I(sqrt_llvm, 1) \
+    ADD_I(powi_llvm, 2) \
+    ALIAS(sqrt_llvm_fast, sqrt_llvm) \
+    /*  pointer access */ \
+    ADD_I(pointerref, 2) \
+    ADD_I(pointerset, 3) \
+    /* c interface */ \
+    ALIAS(ccall, ccall) \
+    ALIAS(cglobal, cglobal) \
+    ALIAS(llvmcall, llvmcall) \
+    /*  hidden intrinsics */ \
+    ADD_HIDDEN(fptoui_auto, 1) \
+    ADD_HIDDEN(fptosi_auto, 1)
+
+enum intrinsic { // one enumerator per INTRINSICS entry, in table order
+#define ADD_I(func, nargs) func,
+#define ADD_HIDDEN ADD_I
+#define ALIAS ADD_I // an alias gets its own enumerator; the second macro argument is unused here
+    INTRINSICS
+#undef ADD_I
+#undef ADD_HIDDEN
+#undef ALIAS
+    num_intrinsics, // follows the last table entry; sizes the per-intrinsic arrays
+    reinterpret = box // second name for the value of box, not a separate table entry
+};
+
+#ifdef __cplusplus
+extern "C"
+#endif
+const char* jl_intrinsic_name(int f) // maps an intrinsic id to its table name, or "invalid" for any other value
+{
+#define ADD_I(func, nargs) case func: return #func;
+#define ADD_HIDDEN ADD_I
+#define ALIAS ADD_I
+    switch ((enum intrinsic)f) { // the table expands to one `case` per entry
+    INTRINSICS
+    default: return "invalid";
+    }
+#undef ADD_I
+#undef ADD_HIDDEN
+#undef ALIAS
+}
+
+static void* runtime_fp[num_intrinsics];
+static unsigned intrinsic_nargs[num_intrinsics];
+
+typedef jl_value_t *(*intrinsic_call_1_arg)(jl_value_t*);
+typedef jl_value_t *(*intrinsic_call_2_arg)(jl_value_t*, jl_value_t*);
+typedef jl_value_t *(*intrinsic_call_3_arg)(jl_value_t*, jl_value_t*, jl_value_t*);
+#define jl_is_intrinsic(v)       jl_typeis(v,jl_intrinsic_type)
+
+#ifdef __cplusplus
+extern "C"
+#endif
+JL_CALLABLE(jl_f_intrinsic_call) // intrinsic_call(f, operands...): dispatch to the registered runtime function for f
+{
+    JL_NARGSV(intrinsic_call, 1);
+    JL_TYPECHK(intrinsic_call, intrinsic, args[0]);
+    enum intrinsic f = (enum intrinsic)*(uint32_t*)jl_data_ptr(args[0]);
+    // nargs counts args[0] as well, so the one-operand form of fptoui/fptosi arrives with nargs == 2
+    if (f == fptoui && nargs == 2) f = fptoui_auto;
+    if (f == fptosi && nargs == 2) f = fptosi_auto;
+    unsigned fargs = intrinsic_nargs[f];
+    if (fargs == 0) jl_error("this intrinsic must be compiled to be called"); // ccall, cglobal, llvmcall register no runtime function
+    JL_NARGS(intrinsic_call, 1 + fargs, 1 + fargs);
+    switch (fargs) {
+        case 1:
+            return ((intrinsic_call_1_arg)runtime_fp[f])(args[1]);
+        case 2:
+            return ((intrinsic_call_2_arg)runtime_fp[f])(args[1], args[2]);
+        case 3:
+            return ((intrinsic_call_3_arg)runtime_fp[f])(args[1], args[2], args[3]);
+        default:
+            assert(0 && "unexpected number of arguments to an intrinsic function");
+    }
+    abort();
+}
+
+static void add_intrinsic_properties(enum intrinsic f, unsigned nargs, void *pfunc)
+{
+    runtime_fp[f] = pfunc;       // untyped pointer to the runtime implementation
+    intrinsic_nargs[f] = nargs;  // operand count consulted by jl_f_intrinsic_call
+}
+
+static void add_intrinsic(jl_module_t *inm, const char *name, enum intrinsic f)
+{
+    // bind `name` in module `inm` to the boxed id of `f`, then export that binding
+    jl_sym_t *binding = jl_symbol(name);
+    jl_set_const(inm, binding, jl_box32(jl_intrinsic_type, (int32_t)f));
+    jl_module_export(inm, binding);
+}
+
+
+#ifdef __cplusplus
+extern "C"
+#endif
+void jl_init_intrinsic_functions(void) // builds the Intrinsics module under jl_core_module and registers every table entry
+{
+    jl_module_t *inm = jl_new_module(jl_symbol("Intrinsics"));
+    inm->parent = jl_core_module;
+    jl_set_const(jl_core_module, jl_symbol("Intrinsics"), (jl_value_t*)inm);
+
+#define ADD_I(name, nargs) add_intrinsic(inm, #name, name); add_intrinsic_properties(name, nargs, (void*)&jl_##name); // exported and callable
+#define ADD_HIDDEN(name, nargs) add_intrinsic_properties(name, nargs, (void*)&jl_##name); // callable, but no binding in the module
+#define ALIAS(alias, base) add_intrinsic(inm, #alias, alias); add_intrinsic_properties(alias, intrinsic_nargs[base], runtime_fp[base]); // exported; copies base's entry
+    ADD_HIDDEN(reinterpret, 2); // fill slot `reinterpret` (== box) before the table's ALIAS entries read it
+    INTRINSICS
+#undef ADD_I
+#undef ADD_HIDDEN
+#undef ALIAS
+
+    jl_set_const(inm, jl_symbol("intrinsic_call"),
+            (jl_value_t*)jl_new_closure(jl_f_intrinsic_call, (jl_value_t*)jl_symbol("intrinsic_call"), NULL));
+}
diff --git a/src/julia_internal.h b/src/julia_internal.h
index 86490022e2340..fe9f2ac488256 100644
--- a/src/julia_internal.h
+++ b/src/julia_internal.h
@@ -116,6 +116,7 @@ jl_value_t *jl_nth_slot_type(jl_tupletype_t *sig, size_t i);
 void jl_compute_field_offsets(jl_datatype_t *st);
 jl_array_t *jl_new_array_for_deserialization(jl_value_t *atype, uint32_t ndims, size_t *dims,
                                              int isunboxed, int elsz);
+DLLEXPORT jl_value_t *jl_new_box(jl_value_t *v);
 extern jl_array_t *jl_module_init_order;
 
 #ifdef JL_USE_INTEL_JITEVENTS
@@ -145,10 +146,6 @@ void jl_dump_objfile(char *fname, int jit_model, const char *sysimg_data, size_t
 int32_t jl_get_llvm_gv(jl_value_t *p);
 void jl_idtable_rehash(jl_array_t **pa, size_t newsz);
 
-#ifdef _OS_LINUX_
-DLLEXPORT void jl_read_sonames(void);
-#endif
-
 jl_lambda_info_t *jl_add_static_parameters(jl_lambda_info_t *l, jl_svec_t *sp);
 jl_function_t *jl_get_specialization(jl_function_t *f, jl_tupletype_t *types);
 jl_function_t *jl_module_get_initializer(jl_module_t *m);
@@ -214,6 +211,10 @@ extern uv_lib_t *jl_crtdll_handle;
 extern uv_lib_t *jl_winsock_handle;
 #endif
 
+uv_lib_t *jl_get_library(char *f_lib);
+DLLEXPORT void *jl_load_and_lookup(char *f_lib, char *f_name, uv_lib_t **hnd);
+
+
 // libuv wrappers:
 DLLEXPORT int jl_fs_rename(const char *src_path, const char *dst_path);
 
@@ -226,6 +227,7 @@ extern DLLEXPORT jl_value_t *jl_segv_exception;
 #endif
 
 // Runtime intrinsics //
+const char* jl_intrinsic_name(int f);
 
 DLLEXPORT jl_value_t *jl_reinterpret(jl_value_t *ty, jl_value_t *v);
 DLLEXPORT jl_value_t *jl_pointerref(jl_value_t *p, jl_value_t *i);
diff --git a/src/runtime_ccall.cpp b/src/runtime_ccall.cpp
new file mode 100644
index 0000000000000..f1623fc6ac08f
--- /dev/null
+++ b/src/runtime_ccall.cpp
@@ -0,0 +1,140 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+
+#include <map>
+#include <string>
+#include "julia.h"
+#include "julia_internal.h"
+
+// --- library symbol lookup ---
+
+// map from "libX" to full soname "libX.so.ver"
+#if defined(__linux__) || defined(__FreeBSD__)
+static std::map<std::string, std::string> sonameMap;
+static bool got_sonames = false;
+
+static void jl_read_sonames(void) // fills sonameMap from the output of /sbin/ldconfig
+{
+    char *line=NULL;
+    size_t sz=0;
+#if defined(__linux__)
+    FILE *ldc = popen("/sbin/ldconfig -p", "r");
+#else
+    FILE *ldc = popen("/sbin/ldconfig -r", "r");
+#endif
+    if (ldc == NULL) return; // popen failed: leave sonameMap empty instead of handing NULL to feof/pclose
+    while (!feof(ldc)) {
+        ssize_t n = getline(&line, &sz, ldc);
+        if (n == -1)
+            break;
+        if (n > 2 && isspace((unsigned char)line[0])) { // only lines that start with whitespace are parsed
+#ifdef __linux__
+            int i = 0;
+            while (isspace((unsigned char)line[++i])) ;
+            char *name = &line[i];
+            char *dot = strstr(name, ".so");
+            i = 0;
+#else
+            char *name = strstr(line, ":-l");
+            if (name == NULL) continue;
+            strncpy(name, "lib", 3); // overwrite ":-l" in place so the name starts with "lib"
+            char *dot = strchr(name, '.');
+#endif
+
+            if (NULL == dot)
+                continue;
+
+#ifdef __linux__
+            // Detect if this entry is for the current architecture
+            while (!isspace((unsigned char)dot[++i])) ;
+            while (isspace((unsigned char)dot[++i])) ;
+            int j = i;
+            while (!isspace((unsigned char)dot[++j])) ;
+            char *arch = strstr(dot+i,"x86-64");
+            if (arch != NULL && arch < dot + j) {
+#ifdef _P32
+                continue;
+#endif
+            }
+            else {
+#ifdef _P64
+                continue;
+#endif
+            }
+#endif // __linux__
+
+            char *abslibpath = strrchr(line, ' ');
+            if (dot != NULL && abslibpath != NULL) {
+                std::string pfx(name, dot - name); // key: the text from name up to the dot
+                // Do not include ' ' in front and '\n' at the end
+                std::string soname(abslibpath+1, line+n-(abslibpath+1)-1);
+                sonameMap[pfx] = soname;
+            }
+        }
+    }
+
+    free(line);
+    pclose(ldc);
+}
+
+extern "C" DLLEXPORT const char *jl_lookup_soname(const char *pfx, size_t n)
+{
+    // the table is read from ldconfig lazily, on the first lookup
+    if (!got_sonames) {
+        jl_read_sonames();
+        got_sonames = true;
+    }
+    std::map<std::string, std::string>::iterator hit = sonameMap.find(std::string(pfx, n));
+    if (hit == sonameMap.end())
+        return NULL; // no entry for the first n characters of pfx
+    return hit->second.c_str();
+}
+#endif
+
+// map from user-specified lib names to handles
+static std::map<std::string, uv_lib_t*> libMap;
+
+extern "C"
+uv_lib_t *jl_get_library(char *f_lib)
+{
+    // Returns the handle for f_lib; a handle obtained from jl_load_dynamic_library
+    // is remembered in libMap and reused on later calls.
+#ifdef _OS_WINDOWS_
+    // the pointer values 1 and 2 select the exe and dl handles
+    switch ((intptr_t)f_lib) {
+    case 1: return jl_exe_handle;
+    case 2: return jl_dl_handle;
+    }
+#endif
+    if (f_lib == NULL)
+        return jl_RTLD_DEFAULT_handle;
+    // operator[] creates a NULL slot for a name not seen before
+    uv_lib_t *&cached = libMap[f_lib];
+    if (cached == NULL)
+        cached = (uv_lib_t *) jl_load_dynamic_library(f_lib, JL_RTLD_DEFAULT);
+    return cached;
+}
+
+extern "C" DLLEXPORT
+void *jl_load_and_lookup(char *f_lib, char *f_name, uv_lib_t **hnd)
+{
+    if (*hnd == NULL) // empty caller-side slot: resolve the library and store the handle there
+        *hnd = jl_get_library(f_lib);
+    uv_lib_t *lib = *hnd;
+    void *sym = jl_dlsym_e(lib, f_name);
+    if (sym == NULL)
+        jl_errorf("symbol \"%s\" could not be found: %s", f_name, uv_dlerror(lib));
+    return sym;
+}
+
+// miscellany
+#include <llvm/Support/Host.h>
+extern "C" DLLEXPORT
+jl_value_t *jl_get_cpu_name(void)
+{
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 5
+    std::string HostCPUName = llvm::sys::getHostCPUName();
+#else
+    llvm::StringRef HostCPUName = llvm::sys::getHostCPUName(); // qualified: this file has no using-directive for namespace llvm
+#endif
+    return jl_pchar_to_string(HostCPUName.data(), HostCPUName.size()); // host CPU name as reported by LLVM, passed as pointer + length
+}
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index bd2c296b99f87..a7910152bcd23 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -1,5 +1,5 @@
 // This file is a part of Julia. License is MIT: http://julialang.org/license
-//
+
 // This is in implementation of the Julia intrinsic functions against boxed types
 // excluding the c interface (ccall, cglobal, llvmcall)
 //
diff --git a/src/sys.c b/src/sys.c
index 4d88301b8aed3..df5d373501382 100644
--- a/src/sys.c
+++ b/src/sys.c
@@ -540,14 +540,12 @@ DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
 DLLEXPORT void jl_native_alignment(uint_t *int8align, uint_t *int16align, uint_t *int32align,
                                    uint_t *int64align, uint_t *float32align, uint_t *float64align)
 {
-    LLVMTargetDataRef tgtdata = LLVMCreateTargetData("");
-    *int8align = LLVMPreferredAlignmentOfType(tgtdata, LLVMInt8Type());
-    *int16align = LLVMPreferredAlignmentOfType(tgtdata, LLVMInt16Type());
-    *int32align = LLVMPreferredAlignmentOfType(tgtdata, LLVMInt32Type());
-    *int64align = LLVMPreferredAlignmentOfType(tgtdata, LLVMInt64Type());
-    *float32align = LLVMPreferredAlignmentOfType(tgtdata, LLVMFloatType());
-    *float64align = LLVMPreferredAlignmentOfType(tgtdata, LLVMDoubleType());
-    LLVMDisposeTargetData(tgtdata);
+    *int8align = __alignof(uint8_t); // each output receives the compiler's __alignof for the matching C type
+    *int16align = __alignof(uint16_t);
+    *int32align = __alignof(uint32_t);
+    *int64align = __alignof(uint64_t);
+    *float32align = __alignof(float);
+    *float64align = __alignof(double);
 }
 
 DLLEXPORT jl_value_t *jl_is_char_signed()

From d23dc8531fb335c4a7d4e1c463f67cba1d1efd41 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Wed, 7 Oct 2015 13:12:40 -0400
Subject: [PATCH 05/11] improve jl_ printing for functions

---
 src/builtins.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/builtins.c b/src/builtins.c
index 0dd497981bc6a..c4691dc1126d8 100644
--- a/src/builtins.c
+++ b/src/builtins.c
@@ -1400,12 +1400,14 @@ static size_t jl_static_show_x_(JL_STREAM *out, jl_value_t *v,
             n += jl_printf(out, "%s", jl_gf_name(v)->name);
         }
         else {
-            n += jl_printf(out, "#<function>");
+            n += jl_printf(out, "#<function ");
+            n += jl_static_show_x(out, (jl_value_t*)((jl_function_t*)v)->linfo, depth);
+            n += jl_printf(out, ">");
         }
     }
     else if (vt == jl_intrinsic_type) {
-        n += jl_printf(out, "#<intrinsic function %d>",
-                       *(uint32_t*)jl_data_ptr(v));
+        int f = *(uint32_t*)jl_data_ptr(v);
+        n += jl_printf(out, "#<intrinsic #%d %s>", f, jl_intrinsic_name(f));
     }
     else if (vt == jl_int64_type) {
         n += jl_printf(out, "%" PRId64, *(int64_t*)v);

From a2516aa040d12cb8e3a0459f224f0cd24073e5e0 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Tue, 13 Oct 2015 12:30:33 -0400
Subject: [PATCH 06/11] cleanup runtime_intrinsics DLLEXPORT and license

---
 src/APInt-C.cpp       |   8 ++-
 src/APInt-C.h         | 131 ++++++++++++++++++++++--------------------
 src/intrinsics.cpp    |  11 ++--
 src/runtime_ccall.cpp |   1 +
 4 files changed, 83 insertions(+), 68 deletions(-)

diff --git a/src/APInt-C.cpp b/src/APInt-C.cpp
index 16a4c3059493e..1558f5f816b5c 100644
--- a/src/APInt-C.cpp
+++ b/src/APInt-C.cpp
@@ -1,10 +1,14 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
+
 #include "llvm-version.h"
 #include <llvm/ADT/APInt.h>
 #include <llvm/ADT/APFloat.h>
 #include <llvm/Support/MathExtras.h>
 
-#define DLLEXPORT
-extern "C" DLLEXPORT void jl_error(const char *str);
+extern "C" {
+#include "APInt-C.h"
+DLLEXPORT void jl_error(const char *str);
+}
 
 using namespace llvm;
 
diff --git a/src/APInt-C.h b/src/APInt-C.h
index da7c08652d2a8..2dd97fa02e0a9 100644
--- a/src/APInt-C.h
+++ b/src/APInt-C.h
@@ -1,3 +1,4 @@
+// This file is a part of Julia. License is MIT: http://julialang.org/license
 
 #ifndef APINT_C_H
 #define APINT_C_H
@@ -5,70 +6,76 @@
 #ifdef __cplusplus
 extern "C" {
 #endif
+#include "dtypes.h"
+
+#ifdef LLVM_VERSION_MAJOR
+using llvm::integerPart;
+#else
 typedef void integerPart;
+#endif
 
-void LLVMNeg(unsigned numbits, integerPart *pa, integerPart *pr);
-void LLVMByteSwap(unsigned numbits, integerPart *pa, integerPart *pr);
-
-void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-
-void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void LLVMFlipAllBits(unsigned numbits, integerPart *pa, integerPart *pr);
-
-int LLVMICmpEQ(unsigned numbits, integerPart *pa, integerPart *pr);
-int LLVMICmpNE(unsigned numbits, integerPart *pa, integerPart *pb);
-int LLVMICmpSLT(unsigned numbits, integerPart *pa, integerPart *pb);
-int LLVMICmpULT(unsigned numbits, integerPart *pa, integerPart *pb);
-int LLVMICmpSLE(unsigned numbits, integerPart *pa, integerPart *pb);
-int LLVMICmpULE(unsigned numbits, integerPart *pa, integerPart *pb);
-
-int LLVMAdd_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-
-unsigned LLVMCountPopulation(unsigned numbits, integerPart *pa);
-unsigned LLVMCountTrailingOnes(unsigned numbits, integerPart *pa);
-unsigned LLVMCountTrailingZeros(unsigned numbits, integerPart *pa);
-unsigned LLVMCountLeadingOnes(unsigned numbits, integerPart *pa);
-unsigned LLVMCountLeadingZeros(unsigned numbits, integerPart *pa);
-
-void LLVMFPtoSI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-void LLVMFPtoUI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-void LLVMSItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-void LLVMUItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-void LLVMSExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-void LLVMZExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-void LLVMTrunc(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-
-int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
-
-void jl_LLVMSMod(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
-
-unsigned countTrailingZeros_8(uint8_t Val);
-unsigned countTrailingZeros_16(uint16_t Val);
-unsigned countTrailingZeros_32(uint32_t Val);
-unsigned countTrailingZeros_64(uint64_t Val);
-
-uint8_t getSwappedBytes_8(uint8_t Value); // no-op
-uint16_t getSwappedBytes_16(uint16_t Value);
-uint32_t getSwappedBytes_32(uint32_t Value);
-uint64_t getSwappedBytes_64(uint64_t Value);
+DLLEXPORT void LLVMNeg(unsigned numbits, integerPart *pa, integerPart *pr);
+DLLEXPORT void LLVMByteSwap(unsigned numbits, integerPart *pa, integerPart *pr);
+
+DLLEXPORT void LLVMAdd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMSub(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMMul(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMSDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMUDiv(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMSRem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMURem(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+
+DLLEXPORT void LLVMAnd(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMOr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMXor(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMShl(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMLShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMAShr(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void LLVMFlipAllBits(unsigned numbits, integerPart *pa, integerPart *pr);
+
+DLLEXPORT int LLVMICmpEQ(unsigned numbits, integerPart *pa, integerPart *pb); // second operand named pb, as in the other LLVMICmp* prototypes
+DLLEXPORT int LLVMICmpNE(unsigned numbits, integerPart *pa, integerPart *pb);
+DLLEXPORT int LLVMICmpSLT(unsigned numbits, integerPart *pa, integerPart *pb);
+DLLEXPORT int LLVMICmpULT(unsigned numbits, integerPart *pa, integerPart *pb);
+DLLEXPORT int LLVMICmpSLE(unsigned numbits, integerPart *pa, integerPart *pb);
+DLLEXPORT int LLVMICmpULE(unsigned numbits, integerPart *pa, integerPart *pb);
+
+DLLEXPORT int LLVMAdd_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT int LLVMAdd_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT int LLVMSub_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT int LLVMSub_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT int LLVMMul_sov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT int LLVMMul_uov(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+
+DLLEXPORT unsigned LLVMCountPopulation(unsigned numbits, integerPart *pa);
+DLLEXPORT unsigned LLVMCountTrailingOnes(unsigned numbits, integerPart *pa);
+DLLEXPORT unsigned LLVMCountTrailingZeros(unsigned numbits, integerPart *pa);
+DLLEXPORT unsigned LLVMCountLeadingOnes(unsigned numbits, integerPart *pa);
+DLLEXPORT unsigned LLVMCountLeadingZeros(unsigned numbits, integerPart *pa);
+
+DLLEXPORT void LLVMFPtoSI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+DLLEXPORT void LLVMFPtoUI(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+DLLEXPORT void LLVMSItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+DLLEXPORT void LLVMUItoFP(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+DLLEXPORT void LLVMSExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+DLLEXPORT void LLVMZExt(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+DLLEXPORT void LLVMTrunc(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+
+DLLEXPORT int LLVMFPtoSI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+DLLEXPORT int LLVMFPtoUI_exact(unsigned numbits, integerPart *pa, unsigned onumbits, integerPart *pr);
+
+DLLEXPORT void jl_LLVMSMod(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+DLLEXPORT void jl_LLVMFlipSign(unsigned numbits, integerPart *pa, integerPart *pb, integerPart *pr);
+
+DLLEXPORT unsigned countTrailingZeros_8(uint8_t Val);
+DLLEXPORT unsigned countTrailingZeros_16(uint16_t Val);
+DLLEXPORT unsigned countTrailingZeros_32(uint32_t Val);
+DLLEXPORT unsigned countTrailingZeros_64(uint64_t Val);
+
+//uint8_t getSwappedBytes_8(uint8_t Value); // no-op
+//uint16_t getSwappedBytes_16(uint16_t Value);
+//uint32_t getSwappedBytes_32(uint32_t Value);
+//uint64_t getSwappedBytes_64(uint64_t Value);
 
 
 #ifdef __cplusplus
diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index 88f7e1734d877..fe6a4baf2caa0 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -495,7 +495,10 @@ static jl_cgval_t generic_unbox(jl_value_t *targ, jl_value_t *x, jl_codectx_t *c
         }
         else {
             if (!jl_is_leaf_type(v.typ) && !jl_is_bitstype(v.typ)) {
-                return jl_cgval_t(); // TODO: XXX
+                // TODO: currently doesn't handle the case where the type of neither argument is understood at compile time
+                // since codegen has no idea what size it might have
+                jl_error("codegen: failed during evaluation of a call to unbox");
+                return jl_cgval_t();
             }
             nb = jl_datatype_size(v.typ);
             llvmt = staticeval_bitstype(v.typ);
@@ -515,7 +518,7 @@ static jl_cgval_t generic_unbox(jl_value_t *targ, jl_value_t *x, jl_codectx_t *c
 
     if (!jl_is_bitstype(bt)) {
         // TODO: to accept arbitrary types, replace this function with a call to llvm_type_rewrite
-        emit_error("reinterpret: expected bits type as first argument", ctx);
+        emit_error("unbox: expected bits type as first argument", ctx);
         return jl_cgval_t();
     }
 
@@ -536,7 +539,7 @@ static jl_cgval_t generic_unbox(jl_value_t *targ, jl_value_t *x, jl_codectx_t *c
     else {
         vx = v.V;
         if (!jl_is_bitstype(v.typ)) {
-            emit_error("reinterpret: expected bits type value for second argument", ctx);
+            emit_error("unbox: expected bits type value for second argument", ctx);
             return jl_cgval_t();
         }
     }
@@ -554,7 +557,7 @@ static jl_cgval_t generic_unbox(jl_value_t *targ, jl_value_t *x, jl_codectx_t *c
         if (vxt->getPrimitiveSizeInBits() != llvmt->getPrimitiveSizeInBits() &&
             !(vxt->isPointerTy() && llvmt->getPrimitiveSizeInBits() == sizeof(void*)*8) &&
             !(llvmt->isPointerTy() && vxt->getPrimitiveSizeInBits() == sizeof(void*)*8)) {
-            emit_error("box: argument is of incorrect size", ctx);
+            emit_error("unbox: argument is of incorrect size", ctx);
             return jl_cgval_t();
         }
         if (vxt->isPointerTy() && !llvmt->isPointerTy())
diff --git a/src/runtime_ccall.cpp b/src/runtime_ccall.cpp
index f1623fc6ac08f..81e43164c64bd 100644
--- a/src/runtime_ccall.cpp
+++ b/src/runtime_ccall.cpp
@@ -2,6 +2,7 @@
 
 #include <map>
 #include <string>
+#include <cstdio>
 #include "julia.h"
 #include "julia_internal.h"
 

From 897886ce84dd063aad394d339aa30304dd5fb89f Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Fri, 16 Oct 2015 19:03:22 -0400
Subject: [PATCH 07/11] remove gcc-isms from runtime-intrinsics code

---
 src/intrinsics.cpp       |  2 +-
 src/runtime_intrinsics.c | 13 ++++++++-----
 2 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/intrinsics.cpp b/src/intrinsics.cpp
index fe6a4baf2caa0..29f85674a447b 100644
--- a/src/intrinsics.cpp
+++ b/src/intrinsics.cpp
@@ -920,7 +920,7 @@ static jl_cgval_t emit_intrinsic(intrinsic f, jl_value_t **args, size_t nargs,
     case ccall: return emit_ccall(args, nargs, ctx);
     case cglobal: return emit_cglobal(args, nargs, ctx);
     case llvmcall: return emit_llvmcall(args, nargs, ctx);
-#if 0
+#if 0 // this section enables runtime-intrinsics (e.g. for testing), and disables their llvm counterparts
     default:
         int ldepth = ctx->gc.argDepth;
         Value *r;
diff --git a/src/runtime_intrinsics.c b/src/runtime_intrinsics.c
index a7910152bcd23..6b7d3edfe09ba 100644
--- a/src/runtime_intrinsics.c
+++ b/src/runtime_intrinsics.c
@@ -86,7 +86,7 @@ static inline unsigned int next_power_of_two(unsigned int val) {
 
 static inline char signbitbyte(void *a, unsigned bytes) {
     // sign bit of an signed number of n bytes, as a byte
-    return signbit(((signed char*)a)[bytes-1]) ? ~0 : 0;
+    return (((signed char*)a)[bytes - 1] < 0) ? ~0 : 0;
 }
 
 static inline char usignbitbyte(void *a, unsigned bytes) {
@@ -111,7 +111,9 @@ static inline unsigned select_by_size(unsigned sz)
     typedef intrinsic##_t select_##intrinsic##_t[6]; \
     static inline intrinsic##_t select_##intrinsic(unsigned sz, select_##intrinsic##_t list) \
     { \
-        return list[select_by_size(sz)] ?: list[0]; \
+        intrinsic##_t thunk = list[select_by_size(sz)]; \
+        if (!thunk) thunk = list[0]; \
+        return thunk; \
     }
 
 #define fp_select(a, func) \
@@ -289,7 +291,7 @@ static inline jl_value_t *jl_iintrinsic_1(jl_value_t *ty, jl_value_t *a, const c
         /* TODO: this memcpy assumes little-endian,
          * for big-endian, need to align the copy to the other end */ \
         memcpy(pa2, pa, isize);
-        memset(pa2 + isize, getsign(pa, isize), osize2 - isize);
+        memset((char*)pa2 + isize, getsign(pa, isize), osize2 - isize);
         pa = pa2;
     }
     jl_value_t *newv = lambda1(ty, pa, osize, osize2, list);
@@ -837,8 +839,9 @@ DLLEXPORT jl_value_t *jl_check_top_bit(jl_value_t *a)
 
 // checked arithmetic
 #define check_sadd(a,b) \
-        /* this test is a reduction of (b > 0) ? (a + b >= typemin(a)) : (a + b < typemin(a)) ==> overflow */ \
-        (b > 0) == (a >= (((typeof(a))1) << (8 * sizeof(a) - 1)) - b)
+        /* this test is a reduction of (b > 0) ? (a + b >= typemin(a)) : (a + b < typemin(a)) ==> overflow \
+         * where (a - a) == (typeof(a))0 */ \
+        (b > 0) == (a >= ((a - a + 1) << (8 * sizeof(a) - 1)) - b)
 checked_iintrinsic_fast(LLVMAdd_sov, check_sadd, add, checked_sadd,  )
 #define check_uadd(a,b) \
         /* this test checks for (a + b) > typemax(a) ==> overflow */ \

From 3edb24c2e5aecb30c9ab06c440e06a372dc13241 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Thu, 15 Oct 2015 18:03:42 -0400
Subject: [PATCH 08/11] keep TypeVar out of typeinf results, and add assertion

---
 base/inference.jl | 22 ++++++++++++++++------
 src/dump.c        |  1 +
 2 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/base/inference.jl b/base/inference.jl
index fc190395ebb30..3c2836a8968da 100644
--- a/base/inference.jl
+++ b/base/inference.jl
@@ -903,6 +903,9 @@ function abstract_call(f, fargs, argtypes::Vector{Any}, vtypes, sv::StaticVarInf
         end
     end
     rt = builtin_tfunction(f, fargs, Tuple{argtypes...}, vtypes, sv)
+    if isa(rt, TypeVar)
+        rt = rt.ub
+    end
     #print("=> ", rt, "\n")
     return rt
 end
@@ -1324,14 +1327,14 @@ function typeinf(linfo::LambdaStaticData,atypes::ANY,sparams::SimpleVector, def,
                     break
                 end
                 if isa(code,Type)
-                    curtype = code
+                    curtype = code::Type
                     # sometimes just a return type is stored here. if a full AST
                     # is not needed, we can return it.
                     if !needtree
                         return (nothing, code)
                     end
                 else
-                    curtype = ccall(:jl_ast_rettype, Any, (Any,Any), def, code)
+                    curtype = ccall(:jl_ast_rettype, Any, (Any,Any), def, code)::Type
                     return (code, curtype)
                 end
             end
@@ -1344,7 +1347,7 @@ function typeinf(linfo::LambdaStaticData,atypes::ANY,sparams::SimpleVector, def,
 
     (fulltree, result, rec) = typeinf_uncached(linfo, atypes, sparams, def, curtype, cop, true)
     if fulltree === ()
-        return (fulltree,result)
+        return (fulltree, result::Type)
     end
 
     if !redo
@@ -1373,7 +1376,7 @@ function typeinf(linfo::LambdaStaticData,atypes::ANY,sparams::SimpleVector, def,
         def.tfunc[tfunc_idx+1] = rec
     end
 
-    return (fulltree, result)
+    return (fulltree, result::Type)
 end
 
 typeinf_uncached(linfo, atypes::ANY, sparams::ANY; optimize=true) =
@@ -1487,14 +1490,21 @@ function typeinf_uncached(linfo::LambdaStaticData, atypes::ANY, sparams::SimpleV
             lastatype = lastatype.parameters[1]
             laty -= 1
         end
+        if isa(lastatype, TypeVar)
+            lastatype = lastatype.ub
+        end
         if laty > la
             laty = la
         end
         for i=1:laty
-            s[1][args[i]] = VarState(atypes.parameters[i],false)
+            atyp = atypes.parameters[i]
+            if isa(atyp, TypeVar)
+                atyp = atyp.ub
+            end
+            s[1][args[i]] = VarState(atyp, false)
         end
         for i=laty+1:la
-            s[1][args[i]] = VarState(lastatype,false)
+            s[1][args[i]] = VarState(lastatype, false)
         end
     elseif la != 0
         return ((), Bottom, false) # wrong number of arguments
diff --git a/src/dump.c b/src/dump.c
index da91aa1f86e95..84d57ea926d2d 100644
--- a/src/dump.c
+++ b/src/dump.c
@@ -1942,6 +1942,7 @@ DLLEXPORT jl_value_t *jl_ast_rettype(jl_lambda_info_t *li, jl_value_t *ast)
 {
     if (jl_is_expr(ast))
         return jl_lam_body((jl_expr_t*)ast)->etype;
+    assert(jl_is_array(ast));
     JL_SIGATOMIC_BEGIN();
     DUMP_MODES last_mode = mode;
     mode = MODE_AST;

From 6e100101742d7ba20a11e5d306fe9f1842336d21 Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Tue, 13 Oct 2015 21:15:24 -0400
Subject: [PATCH 09/11] avoid running and clearing the __init__ list when
 building output

---
 src/init.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/init.c b/src/init.c
index 05479077a80cb..71d5eb156eef6 100644
--- a/src/init.c
+++ b/src/init.c
@@ -604,7 +604,7 @@ void _julia_init(JL_IMAGE_SEARCH rel)
 
     jl_gc_enable(1);
 
-    if (jl_options.image_file) {
+    if (jl_options.image_file && (!jl_generating_output() || jl_options.incremental)) {
         jl_array_t *temp = jl_module_init_order;
         JL_GC_PUSH1(&temp);
         jl_module_init_order = NULL;

From 040cf46f99f863fb201881813ee92a36ec69bb3e Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Thu, 15 Oct 2015 18:25:42 -0400
Subject: [PATCH 10/11] store a minimal list for jl_module_init_order

Reduces runtime lookup effort, but the real reason for this change is to work around problems caused by replacing Core.Inference.
---
 src/init.c | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/init.c b/src/init.c
index 71d5eb156eef6..aea44e293045e 100644
--- a/src/init.c
+++ b/src/init.c
@@ -627,15 +627,29 @@ void jl_compile_all(void);
 
 static void julia_save()
 {
+    if (!jl_generating_output())
+        return;
+
     if (jl_options.compile_enabled == JL_OPTIONS_COMPILE_ALL)
         jl_compile_all();
 
-    if (jl_options.incremental) {
-        jl_array_t *worklist = jl_module_init_order;
-        if (!worklist) {
-            jl_printf(JL_STDERR, "WARNING: incremental output requested, but no modules defined during run\n");
-            return;
+    if (!jl_module_init_order) {
+        jl_printf(JL_STDERR, "WARNING: --output requested, but no modules defined during run\n");
+        return;
+    }
+
+    jl_array_t *worklist = jl_module_init_order;
+    JL_GC_PUSH1(&worklist);
+    jl_module_init_order = jl_alloc_cell_1d(0);
+    int i, l = jl_array_len(worklist);
+    for (i = 0; i < l; i++) {
+        jl_value_t *m = jl_arrayref(worklist, i);
+        if (jl_module_get_initializer((jl_module_t*)m)) {
+            jl_cell_1d_push(jl_module_init_order, m);
         }
+    }
+
+    if (jl_options.incremental) {
         if (jl_options.outputji)
             if (jl_save_incremental(jl_options.outputji, worklist))
                 jl_exit(1);
@@ -668,6 +682,7 @@ static void julia_save()
         if (jl_options.outputo)
             jl_dump_objfile((char*)jl_options.outputo, 0, (const char*)s->buf, s->size);
     }
+    JL_GC_POP();
 }
 
 jl_function_t *jl_typeinf_func=NULL;

From cda8b06d31466467d13d2a473de51a545d73781c Mon Sep 17 00:00:00 2001
From: Jameson Nash <vtjnash@gmail.com>
Date: Tue, 13 Oct 2015 16:56:16 -0400
Subject: [PATCH 11/11] simplify more function signatures that use Intrinsics
 (for compile-all mode)

---
 base/c.jl | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/base/c.jl b/base/c.jl
index a32bdda4ae56d..adc159e65bf1f 100644
--- a/base/c.jl
+++ b/base/c.jl
@@ -2,7 +2,7 @@
 
 # definitions related to C interface
 
-import Core.Intrinsics: cglobal, box, unbox
+import Core.Intrinsics: cglobal, box
 
 const OS_NAME = ccall(:jl_get_OS_NAME, Any, ())
 
@@ -58,10 +58,10 @@ else
     bitstype 32 Cwstring
 end
 
-convert{T<:Union{Int8,UInt8}}(::Type{Cstring}, p::Ptr{T}) = box(Cstring, unbox(Ptr{T}, p))
-convert(::Type{Cwstring}, p::Ptr{Cwchar_t}) = box(Cwstring, unbox(Ptr{Cwchar_t}, p))
-convert{T<:Union{Int8,UInt8}}(::Type{Ptr{T}}, p::Cstring) = box(Ptr{T}, unbox(Cstring, p))
-convert(::Type{Ptr{Cwchar_t}}, p::Cwstring) = box(Ptr{Cwchar_t}, unbox(Cwstring, p))
+convert{T<:Union{Int8,UInt8}}(::Type{Cstring}, p::Ptr{T}) = box(Cstring, p)
+convert(::Type{Cwstring}, p::Ptr{Cwchar_t}) = box(Cwstring, p)
+convert{T<:Union{Int8,UInt8}}(::Type{Ptr{T}}, p::Cstring) = box(Ptr{T}, p)
+convert(::Type{Ptr{Cwchar_t}}, p::Cwstring) = box(Ptr{Cwchar_t}, p)
 
 # here, not in pointer.jl, to avoid bootstrapping problems in coreimg.jl
 pointer_to_string(p::Cstring, own::Bool=false) = pointer_to_string(convert(Ptr{UInt8}, p), own)