diff --git a/.appveyor.yml b/.appveyor.yml index 352b3bc3aa915..bd02240ad1060 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -1,4 +1,9 @@ environment: + # We don't want to do identical comdat folding as it messes up the ability to + # generate lossless backtraces in some cases. This is enabled by rustc by + # default so pass a flag to disable it to ensure our tests work ok. + RUSTFLAGS: -Clink-args=/OPT:NOICF + matrix: - TARGET: x86_64-pc-windows-msvc @@ -15,4 +20,5 @@ build: false test_script: - cargo test --target %TARGET% + - set RUST_BACKTRACE=1 - cargo test --target %TARGET% --release diff --git a/Cargo.toml b/Cargo.toml index 0da061e71c809..87cd5dd14ca44 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -15,7 +15,7 @@ debug = true opt-level = 3 [profile.bench] -debug = 1 +debug = true opt-level = 3 [dev-dependencies] diff --git a/assert-instr/assert-instr-macro/src/lib.rs b/assert-instr/assert-instr-macro/src/lib.rs index 1c4126149097a..9d7093a523223 100644 --- a/assert-instr/assert-instr-macro/src/lib.rs +++ b/assert-instr/assert-instr-macro/src/lib.rs @@ -44,7 +44,9 @@ pub fn assert_instr(attr: TokenStream, item: TokenStream) -> TokenStream { #[allow(non_snake_case)] {ignore} fn assert_instr_{name}() {{ - ::assert_instr::assert({name} as usize, \"{instr}\"); + ::assert_instr::assert({name} as usize, + \"{name}\", + \"{instr}\"); }} ", name = name.as_str(), instr = instr.as_str(), ignore = ignore); let test: TokenStream = test.parse().unwrap(); diff --git a/assert-instr/src/lib.rs b/assert-instr/src/lib.rs index 596668a8f59b4..ada7b8bc3fa0b 100644 --- a/assert-instr/src/lib.rs +++ b/assert-instr/src/lib.rs @@ -221,29 +221,39 @@ fn normalize(symbol: &str) -> String { /// /// This asserts that the function at `fnptr` contains the instruction /// `expected` provided. 
-pub fn assert(fnptr: usize, expected: &str) { +pub fn assert(fnptr: usize, fnname: &str, expected: &str) { // Translate this function pointer to a symbolic name that we'd have found // in the disassembly. let mut sym = None; backtrace::resolve(fnptr as *mut _, |name| { sym = name.name().and_then(|s| s.as_str()).map(normalize); }); - let sym = match sym { + + let functions = match sym.as_ref().and_then(|s| DISASSEMBLY.get(s)) { Some(s) => s, - None => panic!("failed to get symbol of function pointer: {}", fnptr), + None => { + if let Some(sym) = sym { + println!("assumed symbol name: `{}`", sym); + } + println!("maybe related functions"); + for f in DISASSEMBLY.keys().filter(|k| k.contains(fnname)) { + println!("\t- {}", f); + } + panic!("failed to find disassembly of {:#x} ({})", fnptr, fnname); + } }; - // Find our function in the list of all disassembled functions - let functions = &DISASSEMBLY.get(&sym) - .expect(&format!("failed to find disassembly of {}", sym)); assert_eq!(functions.len(), 1); let function = &functions[0]; // Look for `expected` as the first part of any instruction in this // function, returning if we do indeed find it. for instr in function.instrs.iter() { + // Gets the first instruction, e.g. tzcntl in tzcntl %rax,%rax if let Some(part) = instr.parts.get(0) { - if part == expected { + // Truncates the instruction with the length of the expected + // instruction: tzcntl => tzcnt and compares that. + if part.starts_with(expected) { return } } @@ -251,7 +261,7 @@ pub fn assert(fnptr: usize, expected: &str) { // Help debug by printing out the found disassembly, and then panic as we // didn't find the instruction. 
- println!("disassembly for {}: ", sym); + println!("disassembly for {}: ", sym.as_ref().unwrap()); for (i, instr) in function.instrs.iter().enumerate() { print!("\t{:2}: ", i); for part in instr.parts.iter() { @@ -261,4 +271,3 @@ pub fn assert(fnptr: usize, expected: &str) { } panic!("failed to find instruction `{}` in the disassembly", expected); } - diff --git a/src/arm/mod.rs b/src/arm/mod.rs new file mode 100644 index 0000000000000..9472441ae4feb --- /dev/null +++ b/src/arm/mod.rs @@ -0,0 +1,10 @@ +//! ARM intrinsics. +pub use self::v6::*; +pub use self::v7::*; +#[cfg(target_arch = "aarch64")] +pub use self::v8::*; + +mod v6; +mod v7; +#[cfg(target_arch = "aarch64")] +mod v8; diff --git a/src/arm/v6.rs b/src/arm/v6.rs new file mode 100644 index 0000000000000..95442b374f8cf --- /dev/null +++ b/src/arm/v6.rs @@ -0,0 +1,25 @@ +//! ARMv6 intrinsics. +//! +//! The reference is [ARMv6-M Architecture Reference +//! Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0419c/index.html). + +/// Reverse the order of the bytes. +#[inline(always)] +#[cfg_attr(test, assert_instr(rev))] +pub fn _rev_u8(x: u8) -> u8 { + x.swap_bytes() as u8 +} + +/// Reverse the order of the bytes. +#[inline(always)] +#[cfg_attr(test, assert_instr(rev))] +pub fn _rev_u16(x: u16) -> u16 { + x.swap_bytes() as u16 +} + +/// Reverse the order of the bytes. +#[inline(always)] +#[cfg_attr(test, assert_instr(rev))] +pub fn _rev_u32(x: u32) -> u32 { + x.swap_bytes() as u32 +} diff --git a/src/arm/v7.rs b/src/arm/v7.rs new file mode 100644 index 0000000000000..1052b8477a923 --- /dev/null +++ b/src/arm/v7.rs @@ -0,0 +1,40 @@ +//! ARMv7 intrinsics. +//! +//! The reference is [ARMv7-M Architecture Reference Manual (Issue +//! E.b)](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0403e.b/index.html). + +pub use super::v6::*; + +/// Count Leading Zeros. 
+#[inline(always)] +#[cfg_attr(test, assert_instr(clz))] +pub fn _clz_u8(x: u8) -> u8 { + x.leading_zeros() as u8 +} + +/// Count Leading Zeros. +#[inline(always)] +#[cfg_attr(test, assert_instr(clz))] +pub fn _clz_u16(x: u16) -> u16 { + x.leading_zeros() as u16 +} + +/// Count Leading Zeros. +#[inline(always)] +#[cfg_attr(test, assert_instr(clz))] +pub fn _clz_u32(x: u32) -> u32 { + x.leading_zeros() as u32 +} + +#[allow(dead_code)] +extern "C" { + #[link_name="llvm.bitreverse.i32"] + fn rbit_u32(i: i32) -> i32; +} + +/// Reverse the bit order. +#[inline(always)] +#[cfg_attr(test, assert_instr(rbit))] +pub fn _rbit_u32(x: u32) -> u32 { + unsafe { rbit_u32(x as i32) as u32 } +} diff --git a/src/arm/v8.rs b/src/arm/v8.rs new file mode 100644 index 0000000000000..e49ca4fe1f25e --- /dev/null +++ b/src/arm/v8.rs @@ -0,0 +1,54 @@ +//! ARMv8 intrinsics. +//! +//! The reference is [ARMv8-A Reference Manual](http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.ddi0487a.k_10775/index.html). + +pub use super::v7::*; + +/// Reverse the order of the bytes. +#[inline(always)] +#[cfg_attr(test, assert_instr(rev))] +pub fn _rev_u64(x: u64) -> u64 { + x.swap_bytes() as u64 +} + +/// Count Leading Zeros. +#[inline(always)] +#[cfg_attr(test, assert_instr(clz))] +pub fn _clz_u64(x: u64) -> u64 { + x.leading_zeros() as u64 +} + +#[allow(dead_code)] +extern "C" { + #[link_name="llvm.bitreverse.i64"] + fn rbit_u64(i: i64) -> i64; +} + +/// Reverse the bit order. +#[inline(always)] +#[cfg_attr(test, assert_instr(rbit))] +pub fn _rbit_u64(x: u64) -> u64 { + unsafe { rbit_u64(x as i64) as u64 } +} + +/// Counts the leading most significant bits set. +/// +/// When all bits of the operand are set it returns the size of the operand in +/// bits. 
+#[inline(always)] +// LLVM Bug (should be cls): https://bugs.llvm.org/show_bug.cgi?id=31802 +#[cfg_attr(test, assert_instr(clz))] +pub fn _cls_u32(x: u32) -> u32 { + u32::leading_zeros(!x) as u32 +} + +/// Counts the leading most significant bits set. +/// +/// When all bits of the operand are set it returns the size of the operand in +/// bits. +#[inline(always)] +// LLVM Bug (should be cls): https://bugs.llvm.org/show_bug.cgi?id=31802 +#[cfg_attr(test, assert_instr(clz))] +pub fn _cls_u64(x: u64) -> u64 { + u64::leading_zeros(!x) as u64 +} diff --git a/src/lib.rs b/src/lib.rs index e2ec276884099..2e75c3e833d0f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -20,6 +20,9 @@ pub mod simd { pub mod vendor { #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] pub use x86::*; + + #[cfg(any(target_arch = "arm", target_arch = "aarch64"))] + pub use arm::*; } #[macro_use] @@ -31,3 +34,6 @@ mod v512; mod v64; #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] mod x86; + +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +mod arm; diff --git a/src/x86/bmi2.rs b/src/x86/bmi2.rs index 321df40777f13..67f8740399e43 100644 --- a/src/x86/bmi2.rs +++ b/src/x86/bmi2.rs @@ -2,7 +2,7 @@ //! //! The reference is [Intel 64 and IA-32 Architectures Software Developer's //! Manual Volume 2: Instruction Set Reference, -//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). +//! A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). //! //! [Wikipedia](https://en.wikipedia.org/wiki/Bit_Manipulation_Instruction_Sets#BMI2_.28Bit_Manipulation_Instruction_Set_2.29) //! provides a quick overview of the available instructions. 
@@ -15,6 +15,8 @@ use assert_instr::assert_instr; /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. #[inline(always)] +// LLVM BUG (should be mulxl): https://bugs.llvm.org/show_bug.cgi?id=34232 +#[cfg_attr(test, assert_instr(imul))] #[target_feature = "+bmi2"] pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) { let result: u64 = (a as u64) * (b as u64); @@ -27,6 +29,7 @@ pub fn _mulx_u32(a: u32, b: u32) -> (u32, u32) { /// Unsigned multiplication of `a` with `b` returning a pair `(lo, hi)` with /// the low half and the high half of the result. #[inline(always)] +#[cfg_attr(test, assert_instr(mulx))] #[target_feature = "+bmi2"] pub fn _mulx_u64(a: u64, b: u64) -> (u64, u64) { let result: u128 = (a as u128) * (b as u128);