diff --git a/Cargo.lock b/Cargo.lock
index 6f8aa6fa7..361eac726 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -151,6 +151,7 @@ checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3"
 name = "md-5"
 version = "0.10.5"
 dependencies = [
+ "cfg-if",
  "digest",
  "hex-literal",
  "md5-asm",
diff --git a/md5/Cargo.toml b/md5/Cargo.toml
index 847530889..e8f89ad1d 100644
--- a/md5/Cargo.toml
+++ b/md5/Cargo.toml
@@ -16,6 +16,7 @@ name = "md5"
 
 [dependencies]
 digest = "0.10.7"
+cfg-if = "1.0"
 
 [target.'cfg(any(target_arch = "x86", target_arch = "x86_64"))'.dependencies]
 md5-asm = { version = "0.5", optional = true }
@@ -28,4 +29,8 @@ hex-literal = "0.2.2"
 default = ["std"]
 std = ["digest/std"]
 asm = ["md5-asm"] # WARNING: this feature SHOULD NOT be enabled by library crates
+# Use assembly backend for LoongArch64 targets
+# WARNING: Bumps MSRV to 1.72. This feature SHOULD NOT be enabled by library crates
+loongarch64_asm = []
 oid = ["digest/oid"] # Enable OID support. WARNING: Bumps MSRV to 1.57
+force-soft = [] # Force the portable software implementation, overriding the `asm` and `loongarch64_asm` features
diff --git a/md5/src/compress.rs b/md5/src/compress.rs
index 46857038e..c0bcd816a 100644
--- a/md5/src/compress.rs
+++ b/md5/src/compress.rs
@@ -1,165 +1,14 @@
-#![allow(clippy::many_single_char_names, clippy::unreadable_literal)]
-use core::convert::TryInto;
-
-const RC: [u32; 64] = [
-    // round 1
-    0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
-    0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
-    // round 2
-    0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
-    0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
-    // round 3
-    0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
-    0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
-    // round 4
-    0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
-    0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
-];
-
-#[inline(always)]
-fn op_f(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
-    ((x & y) | (!x & z))
-        .wrapping_add(w)
-        .wrapping_add(m)
-        .wrapping_add(c)
-        .rotate_left(s)
-        .wrapping_add(x)
-}
-#[inline(always)]
-fn op_g(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
-    ((x & z) | (y & !z))
-        .wrapping_add(w)
-        .wrapping_add(m)
-        .wrapping_add(c)
-        .rotate_left(s)
-        .wrapping_add(x)
-}
-
-#[inline(always)]
-fn op_h(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
-    (x ^ y ^ z)
-        .wrapping_add(w)
-        .wrapping_add(m)
-        .wrapping_add(c)
-        .rotate_left(s)
-        .wrapping_add(x)
-}
-
-#[inline(always)]
-fn op_i(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
-    (y ^ (x | !z))
-        .wrapping_add(w)
-        .wrapping_add(m)
-        .wrapping_add(c)
-        .rotate_left(s)
-        .wrapping_add(x)
-}
-
-#[inline]
-pub fn compress_block(state: &mut [u32; 4], input: &[u8; 64]) {
-    let mut a = state[0];
-    let mut b = state[1];
-    let mut c = state[2];
-    let mut d = state[3];
-
-    let mut data = [0u32; 16];
-    for (o, chunk) in data.iter_mut().zip(input.chunks_exact(4)) {
-        *o = u32::from_le_bytes(chunk.try_into().unwrap());
-    }
-
-    // round 1
-    a = op_f(a, b, c, d, data[0], RC[0], 7);
-    d = op_f(d, a, b, c, data[1], RC[1], 12);
-    c = op_f(c, d, a, b, data[2], RC[2], 17);
-    b = op_f(b, c, d, a, data[3], RC[3], 22);
-
-    a = op_f(a, b, c, d, data[4], RC[4], 7);
-    d = op_f(d, a, b, c, data[5], RC[5], 12);
-    c = op_f(c, d, a, b, data[6], RC[6], 17);
-    b = op_f(b, c, d, a, data[7], RC[7], 22);
-
-    a = op_f(a, b, c, d, data[8], RC[8], 7);
-    d = op_f(d, a, b, c, data[9], RC[9], 12);
-    c = op_f(c, d, a, b, data[10], RC[10], 17);
-    b = op_f(b, c, d, a, data[11], RC[11], 22);
-
-    a = op_f(a, b, c, d, data[12], RC[12], 7);
-    d = op_f(d, a, b, c, data[13], RC[13], 12);
-    c = op_f(c, d, a, b, data[14], RC[14], 17);
-    b = op_f(b, c, d, a, data[15], RC[15], 22);
-
-    // round 2
-    a = op_g(a, b, c, d, data[1], RC[16], 5);
-    d = op_g(d, a, b, c, data[6], RC[17], 9);
-    c = op_g(c, d, a, b, data[11], RC[18], 14);
-    b = op_g(b, c, d, a, data[0], RC[19], 20);
-
-    a = op_g(a, b, c, d, data[5], RC[20], 5);
-    d = op_g(d, a, b, c, data[10], RC[21], 9);
-    c = op_g(c, d, a, b, data[15], RC[22], 14);
-    b = op_g(b, c, d, a, data[4], RC[23], 20);
-
-    a = op_g(a, b, c, d, data[9], RC[24], 5);
-    d = op_g(d, a, b, c, data[14], RC[25], 9);
-    c = op_g(c, d, a, b, data[3], RC[26], 14);
-    b = op_g(b, c, d, a, data[8], RC[27], 20);
-
-    a = op_g(a, b, c, d, data[13], RC[28], 5);
-    d = op_g(d, a, b, c, data[2], RC[29], 9);
-    c = op_g(c, d, a, b, data[7], RC[30], 14);
-    b = op_g(b, c, d, a, data[12], RC[31], 20);
-
-    // round 3
-    a = op_h(a, b, c, d, data[5], RC[32], 4);
-    d = op_h(d, a, b, c, data[8], RC[33], 11);
-    c = op_h(c, d, a, b, data[11], RC[34], 16);
-    b = op_h(b, c, d, a, data[14], RC[35], 23);
-
-    a = op_h(a, b, c, d, data[1], RC[36], 4);
-    d = op_h(d, a, b, c, data[4], RC[37], 11);
-    c = op_h(c, d, a, b, data[7], RC[38], 16);
-    b = op_h(b, c, d, a, data[10], RC[39], 23);
-
-    a = op_h(a, b, c, d, data[13], RC[40], 4);
-    d = op_h(d, a, b, c, data[0], RC[41], 11);
-    c = op_h(c, d, a, b, data[3], RC[42], 16);
-    b = op_h(b, c, d, a, data[6], RC[43], 23);
-
-    a = op_h(a, b, c, d, data[9], RC[44], 4);
-    d = op_h(d, a, b, c, data[12], RC[45], 11);
-    c = op_h(c, d, a, b, data[15], RC[46], 16);
-    b = op_h(b, c, d, a, data[2], RC[47], 23);
-
-    // round 4
-    a = op_i(a, b, c, d, data[0], RC[48], 6);
-    d = op_i(d, a, b, c, data[7], RC[49], 10);
-    c = op_i(c, d, a, b, data[14], RC[50], 15);
-    b = op_i(b, c, d, a, data[5], RC[51], 21);
-
-    a = op_i(a, b, c, d, data[12], RC[52], 6);
-    d = op_i(d, a, b, c, data[3], RC[53], 10);
-    c = op_i(c, d, a, b, data[10], RC[54], 15);
-    b = op_i(b, c, d, a, data[1], RC[55], 21);
-
-    a = op_i(a, b, c, d, data[8], RC[56], 6);
-    d = op_i(d, a, b, c, data[15], RC[57], 10);
-    c = op_i(c, d, a, b, data[6], RC[58], 15);
-    b = op_i(b, c, d, a, data[13], RC[59], 21);
-
-    a = op_i(a, b, c, d, data[4], RC[60], 6);
-    d = op_i(d, a, b, c, data[11], RC[61], 10);
-    c = op_i(c, d, a, b, data[2], RC[62], 15);
-    b = op_i(b, c, d, a, data[9], RC[63], 21);
-
-    state[0] = state[0].wrapping_add(a);
-    state[1] = state[1].wrapping_add(b);
-    state[2] = state[2].wrapping_add(c);
-    state[3] = state[3].wrapping_add(d);
-}
-
-#[inline]
-pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) {
-    for block in blocks {
-        compress_block(state, block)
+cfg_if::cfg_if! { // Picks the `compress` backend at compile time; the first branch whose cfg holds wins.
+    if #[cfg(feature = "force-soft")] { // tested first, so it overrides both assembly features
+        mod soft;
+        pub use soft::compress;
+    } else if #[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))] {
+        pub use md5_asm::compress; // from the optional `md5-asm` dependency (x86 / x86_64 only)
+    } else if #[cfg(all(feature = "loongarch64_asm", target_arch = "loongarch64"))] {
+        mod loongarch64_asm; // inline-assembly backend in compress/loongarch64_asm.rs
+        pub use loongarch64_asm::compress;
+    } else { // no usable assembly backend: fall back to the portable Rust code
+        mod soft;
+        pub use soft::compress;
     }
 }
diff --git a/md5/src/compress/consts.rs b/md5/src/compress/consts.rs
new file mode 100644
index 000000000..2b6d13042
--- /dev/null
+++ b/md5/src/compress/consts.rs
@@ -0,0 +1,14 @@
+pub const RC: [u32; 64] = [ // one additive constant per step: 4 rounds x 16 steps, indexed by step number
+    // round 1 (RC[0..16])
+    0xd76aa478, 0xe8c7b756, 0x242070db, 0xc1bdceee, 0xf57c0faf, 0x4787c62a, 0xa8304613, 0xfd469501,
+    0x698098d8, 0x8b44f7af, 0xffff5bb1, 0x895cd7be, 0x6b901122, 0xfd987193, 0xa679438e, 0x49b40821,
+    // round 2 (RC[16..32])
+    0xf61e2562, 0xc040b340, 0x265e5a51, 0xe9b6c7aa, 0xd62f105d, 0x02441453, 0xd8a1e681, 0xe7d3fbc8,
+    0x21e1cde6, 0xc33707d6, 0xf4d50d87, 0x455a14ed, 0xa9e3e905, 0xfcefa3f8, 0x676f02d9, 0x8d2a4c8a,
+    // round 3 (RC[32..48])
+    0xfffa3942, 0x8771f681, 0x6d9d6122, 0xfde5380c, 0xa4beea44, 0x4bdecfa9, 0xf6bb4b60, 0xbebfbc70,
+    0x289b7ec6, 0xeaa127fa, 0xd4ef3085, 0x04881d05, 0xd9d4d039, 0xe6db99e5, 0x1fa27cf8, 0xc4ac5665,
+    // round 4 (RC[48..64])
+    0xf4292244, 0x432aff97, 0xab9423a7, 0xfc93a039, 0x655b59c3, 0x8f0ccc92, 0xffeff47d, 0x85845dd1,
+    0x6fa87e4f, 0xfe2ce6e0, 0xa3014314, 0x4e0811a1, 0xf7537e82, 0xbd3af235, 0x2ad7d2bb, 0xeb86d391,
+];
diff --git a/md5/src/compress/loongarch64_asm.rs b/md5/src/compress/loongarch64_asm.rs
new file mode 100644
index 000000000..2af53af35
--- /dev/null
+++ b/md5/src/compress/loongarch64_asm.rs
@@ -0,0 +1,182 @@
+//! LoongArch64 assembly backend
+
+use core::arch::asm;
+
+#[path = "consts.rs"]
+mod consts;
+use consts::*;
+// Joins its string-literal arguments into one assembly snippet; a thin wrapper over `concat!`.
+macro_rules! c {
+    ($($l:expr)*) => {
+        concat!($($l ,)*)
+    };
+}
+// Round-1 step: leaves d ^ (b & (c ^ d)) in $t4 (same truth table as `op_f` in soft.rs), then runs the shared tail.
+macro_rules! round0 {
+    ($a:literal, $b:literal, $c:literal, $d:literal, $k:literal, $s:literal, $i:literal) => {
+        c!(
+            "xor    $t4," $c "," $d ";" // t4 = c ^ d
+            "and    $t4, $t4," $b ";" // t4 &= b
+            "xor    $t4, $t4," $d ";" // t4 ^= d
+            roundtail!($a, $b, $k, $s, $i)
+        )
+    }
+}
+// Round-2 step: leaves (c & !d) | (d & b) in $t4 (same expression as `op_g` in soft.rs), then runs the shared tail.
+macro_rules! round1 {
+    ($a:literal, $b:literal, $c:literal, $d:literal, $k:literal, $s:literal, $i:literal) => {
+        c!(
+            "andn    $t4," $c "," $d ";" // t4 = c & !d
+            "and     $t5," $d "," $b ";" // t5 = d & b
+            "or      $t4, $t4, $t5;" // t4 |= t5
+            roundtail!($a, $b, $k, $s, $i)
+        )
+    }
+}
+// Round-3 step: leaves b ^ c ^ d in $t4 (same expression as `op_h` in soft.rs), then runs the shared tail.
+macro_rules! round2 {
+    ($a:literal, $b:literal, $c:literal, $d:literal, $k:literal, $s:literal, $i:literal) => {
+        c!(
+            "xor    $t4," $c "," $d ";" // t4 = c ^ d
+            "xor    $t4, $t4," $b ";" // t4 ^= b
+            roundtail!($a, $b, $k, $s, $i)
+        )
+    }
+}
+// Round-4 step: leaves c ^ (b | !d) in $t4 (same expression as `op_i` in soft.rs), then runs the shared tail.
+macro_rules! round3 {
+    ($a:literal, $b:literal, $c:literal, $d:literal, $k:literal, $s:literal, $i:literal) => {
+        c!(
+            "orn    $t4," $b "," $d ";" // t4 = b | !d
+            "xor    $t4, $t4," $c ";" // t4 ^= c
+            roundtail!($a, $b, $k, $s, $i)
+        )
+    }
+}
+// Shared end of every step: a = rotl(a + RC[i] + word[k] + t4, s) + b, with t4 prepared by the calling round macro.
+macro_rules! roundtail {
+    ($a:literal, $b:literal, $k:literal, $s:literal, $i:literal) => {
+        c!(
+            "ld.w       $t5, $a3," $i " * 4;" // t5 = RC[i]; $a3 holds the table base
+            "ld.w       $t6, $a1," $k " * 4;" // t6 = 32-bit word k of the current block; $a1 holds the block pointer
+            "add.w      " $a "," $a ", $t5;" // a += RC[i]
+            "add.w      " $a "," $a ", $t6;" // a += word k
+            "add.w      " $a "," $a ", $t4;" // a += round-function value
+            "rotri.w    " $a "," $a ", 32 -" $s ";" // rotate right by 32 - s, i.e. rotate left by s
+            "add.w      " $a "," $a "," $b ";" // a += b
+        )
+    }
+}
+/// Folds each 64-byte block of `blocks`, in order, into the four-word `state` with one inline-assembly loop.
+pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) {
+    if blocks.is_empty() { // the loop below tests its counter only after a full pass, so it must see >= 1 block
+        return;
+    }
+    // SAFETY: loads stay inside `blocks` (64 bytes per counted block) and the 64-entry `RC`; nothing is stored to memory.
+    unsafe {
+        asm!(
+            "42:", // top of the per-block loop
+            // Copy the state ($a4..$a7) into the working registers a..d = $t0..$t3.
+            "move    $t0, $a4",
+            "move    $t1, $a5",
+            "move    $t2, $a6",
+            "move    $t3, $a7",
+
+            /* 64 rounds of hashing; macro arguments are (a, b, c, d, message word k, left-rotate s, RC index i) */
+            round0!("$t0", "$t1", "$t2", "$t3",  0,  7,  0),
+            round0!("$t3", "$t0", "$t1", "$t2",  1, 12,  1),
+            round0!("$t2", "$t3", "$t0", "$t1",  2, 17,  2),
+            round0!("$t1", "$t2", "$t3", "$t0",  3, 22,  3),
+            round0!("$t0", "$t1", "$t2", "$t3",  4,  7,  4),
+            round0!("$t3", "$t0", "$t1", "$t2",  5, 12,  5),
+            round0!("$t2", "$t3", "$t0", "$t1",  6, 17,  6),
+            round0!("$t1", "$t2", "$t3", "$t0",  7, 22,  7),
+            round0!("$t0", "$t1", "$t2", "$t3",  8,  7,  8),
+            round0!("$t3", "$t0", "$t1", "$t2",  9, 12,  9),
+            round0!("$t2", "$t3", "$t0", "$t1", 10, 17, 10),
+            round0!("$t1", "$t2", "$t3", "$t0", 11, 22, 11),
+            round0!("$t0", "$t1", "$t2", "$t3", 12,  7, 12),
+            round0!("$t3", "$t0", "$t1", "$t2", 13, 12, 13),
+            round0!("$t2", "$t3", "$t0", "$t1", 14, 17, 14),
+            round0!("$t1", "$t2", "$t3", "$t0", 15, 22, 15),
+            round1!("$t0", "$t1", "$t2", "$t3",  1,  5, 16),
+            round1!("$t3", "$t0", "$t1", "$t2",  6,  9, 17),
+            round1!("$t2", "$t3", "$t0", "$t1", 11, 14, 18),
+            round1!("$t1", "$t2", "$t3", "$t0",  0, 20, 19),
+            round1!("$t0", "$t1", "$t2", "$t3",  5,  5, 20),
+            round1!("$t3", "$t0", "$t1", "$t2", 10,  9, 21),
+            round1!("$t2", "$t3", "$t0", "$t1", 15, 14, 22),
+            round1!("$t1", "$t2", "$t3", "$t0",  4, 20, 23),
+            round1!("$t0", "$t1", "$t2", "$t3",  9,  5, 24),
+            round1!("$t3", "$t0", "$t1", "$t2", 14,  9, 25),
+            round1!("$t2", "$t3", "$t0", "$t1",  3, 14, 26),
+            round1!("$t1", "$t2", "$t3", "$t0",  8, 20, 27),
+            round1!("$t0", "$t1", "$t2", "$t3", 13,  5, 28),
+            round1!("$t3", "$t0", "$t1", "$t2",  2,  9, 29),
+            round1!("$t2", "$t3", "$t0", "$t1",  7, 14, 30),
+            round1!("$t1", "$t2", "$t3", "$t0", 12, 20, 31),
+            round2!("$t0", "$t1", "$t2", "$t3",  5,  4, 32),
+            round2!("$t3", "$t0", "$t1", "$t2",  8, 11, 33),
+            round2!("$t2", "$t3", "$t0", "$t1", 11, 16, 34),
+            round2!("$t1", "$t2", "$t3", "$t0", 14, 23, 35),
+            round2!("$t0", "$t1", "$t2", "$t3",  1,  4, 36),
+            round2!("$t3", "$t0", "$t1", "$t2",  4, 11, 37),
+            round2!("$t2", "$t3", "$t0", "$t1",  7, 16, 38),
+            round2!("$t1", "$t2", "$t3", "$t0", 10, 23, 39),
+            round2!("$t0", "$t1", "$t2", "$t3", 13,  4, 40),
+            round2!("$t3", "$t0", "$t1", "$t2",  0, 11, 41),
+            round2!("$t2", "$t3", "$t0", "$t1",  3, 16, 42),
+            round2!("$t1", "$t2", "$t3", "$t0",  6, 23, 43),
+            round2!("$t0", "$t1", "$t2", "$t3",  9,  4, 44),
+            round2!("$t3", "$t0", "$t1", "$t2", 12, 11, 45),
+            round2!("$t2", "$t3", "$t0", "$t1", 15, 16, 46),
+            round2!("$t1", "$t2", "$t3", "$t0",  2, 23, 47),
+            round3!("$t0", "$t1", "$t2", "$t3",  0,  6, 48),
+            round3!("$t3", "$t0", "$t1", "$t2",  7, 10, 49),
+            round3!("$t2", "$t3", "$t0", "$t1", 14, 15, 50),
+            round3!("$t1", "$t2", "$t3", "$t0",  5, 21, 51),
+            round3!("$t0", "$t1", "$t2", "$t3", 12,  6, 52),
+            round3!("$t3", "$t0", "$t1", "$t2",  3, 10, 53),
+            round3!("$t2", "$t3", "$t0", "$t1", 10, 15, 54),
+            round3!("$t1", "$t2", "$t3", "$t0",  1, 21, 55),
+            round3!("$t0", "$t1", "$t2", "$t3",  8,  6, 56),
+            round3!("$t3", "$t0", "$t1", "$t2", 15, 10, 57),
+            round3!("$t2", "$t3", "$t0", "$t1",  6, 15, 58),
+            round3!("$t1", "$t2", "$t3", "$t0", 13, 21, 59),
+            round3!("$t0", "$t1", "$t2", "$t3",  4,  6, 60),
+            round3!("$t3", "$t0", "$t1", "$t2", 11, 10, 61),
+            round3!("$t2", "$t3", "$t0", "$t1",  2, 15, 62),
+            round3!("$t1", "$t2", "$t3", "$t0",  9, 21, 63),
+            // Feed-forward: add the working registers back into the state registers.
+            "add.w   $a4, $a4, $t0",
+            "add.w   $a5, $a5, $t1",
+            "add.w   $a6, $a6, $t2",
+            "add.w   $a7, $a7, $t3",
+
+            // Looping over blocks: step the input pointer one block (64 bytes), count down, repeat until zero
+            "addi.d  $a1, $a1, 64",
+            "addi.d  $a2, $a2, -1",
+            "bnez    $a2, 42b",
+            // State words travel in and out through $a4..$a7; the block pointer and counter are consumed.
+            inout("$a4") state[0],
+            inout("$a5") state[1],
+            inout("$a6") state[2],
+            inout("$a7") state[3],
+            inout("$a1") blocks.as_ptr() => _,
+            inout("$a2") blocks.len() => _,
+
+            in("$a3") RC.as_ptr(), // base of the constant table read by every round
+
+            // Clobbers: $t0..$t3 hold the working a..d, $t4..$t6 are scratch for the round macros
+            out("$t0") _,
+            out("$t1") _,
+            out("$t2") _,
+            out("$t3") _,
+            out("$t4") _,
+            out("$t5") _,
+            out("$t6") _,
+
+            options(preserves_flags, readonly, pure, nostack), // the asm performs loads only and never touches the stack
+        );
+    }
+}
diff --git a/md5/src/compress/soft.rs b/md5/src/compress/soft.rs
new file mode 100644
index 000000000..c41f2bb41
--- /dev/null
+++ b/md5/src/compress/soft.rs
@@ -0,0 +1,154 @@
+#![allow(clippy::many_single_char_names, clippy::unreadable_literal)]
+use core::convert::TryInto;
+
+#[path = "consts.rs"]
+mod consts;
+use consts::*;
+/// Round-1 step: bitwise-selects `y` where `x` is set (else `z`), adds `w`, `m`, `c`, rotates left by `s`, adds `x`.
+#[inline(always)]
+fn op_f(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
+    ((x & y) | (!x & z))
+        .wrapping_add(w)
+        .wrapping_add(m)
+        .wrapping_add(c)
+        .rotate_left(s)
+        .wrapping_add(x)
+}
+#[inline(always)] // round-2 step; shares the add / rotate / add tail of the other `op_*` helpers
+fn op_g(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
+    ((x & z) | (y & !z)) // per bit: `x` where `z` is set, otherwise `y`
+        .wrapping_add(w) // state word being replaced
+        .wrapping_add(m) // message word
+        .wrapping_add(c) // round constant
+        .rotate_left(s)
+        .wrapping_add(x)
+}
+/// Round-3 step: XORs `x`, `y` and `z`, adds `w`, `m`, `c` (wrapping), rotates left by `s`, then adds `x`.
+#[inline(always)]
+fn op_h(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
+    (x ^ y ^ z)
+        .wrapping_add(w)
+        .wrapping_add(m)
+        .wrapping_add(c)
+        .rotate_left(s)
+        .wrapping_add(x)
+}
+/// Round-4 step: computes `y ^ (x | !z)`, adds `w`, `m`, `c` (wrapping), rotates left by `s`, then adds `x`.
+#[inline(always)]
+fn op_i(w: u32, x: u32, y: u32, z: u32, m: u32, c: u32, s: u32) -> u32 {
+    (y ^ (x | !z))
+        .wrapping_add(w)
+        .wrapping_add(m)
+        .wrapping_add(c)
+        .rotate_left(s)
+        .wrapping_add(x)
+}
+/// Updates `state` in place with one 64-byte block `input`: 4 rounds of 16 steps, then a wrapping feed-forward add.
+#[inline]
+pub fn compress_block(state: &mut [u32; 4], input: &[u8; 64]) {
+    let mut a = state[0];
+    let mut b = state[1];
+    let mut c = state[2];
+    let mut d = state[3];
+    // Decode the block into sixteen little-endian 32-bit message words.
+    let mut data = [0u32; 16];
+    for (o, chunk) in data.iter_mut().zip(input.chunks_exact(4)) {
+        *o = u32::from_le_bytes(chunk.try_into().unwrap()); // cannot fail: chunks_exact(4) yields 4-byte slices
+    }
+
+    // round 1: op_f, message words used in order 0..=15
+    a = op_f(a, b, c, d, data[0], RC[0], 7);
+    d = op_f(d, a, b, c, data[1], RC[1], 12);
+    c = op_f(c, d, a, b, data[2], RC[2], 17);
+    b = op_f(b, c, d, a, data[3], RC[3], 22);
+
+    a = op_f(a, b, c, d, data[4], RC[4], 7);
+    d = op_f(d, a, b, c, data[5], RC[5], 12);
+    c = op_f(c, d, a, b, data[6], RC[6], 17);
+    b = op_f(b, c, d, a, data[7], RC[7], 22);
+
+    a = op_f(a, b, c, d, data[8], RC[8], 7);
+    d = op_f(d, a, b, c, data[9], RC[9], 12);
+    c = op_f(c, d, a, b, data[10], RC[10], 17);
+    b = op_f(b, c, d, a, data[11], RC[11], 22);
+
+    a = op_f(a, b, c, d, data[12], RC[12], 7);
+    d = op_f(d, a, b, c, data[13], RC[13], 12);
+    c = op_f(c, d, a, b, data[14], RC[14], 17);
+    b = op_f(b, c, d, a, data[15], RC[15], 22);
+
+    // round 2: op_g, word index starts at 1 and advances by 5 (mod 16)
+    a = op_g(a, b, c, d, data[1], RC[16], 5);
+    d = op_g(d, a, b, c, data[6], RC[17], 9);
+    c = op_g(c, d, a, b, data[11], RC[18], 14);
+    b = op_g(b, c, d, a, data[0], RC[19], 20);
+
+    a = op_g(a, b, c, d, data[5], RC[20], 5);
+    d = op_g(d, a, b, c, data[10], RC[21], 9);
+    c = op_g(c, d, a, b, data[15], RC[22], 14);
+    b = op_g(b, c, d, a, data[4], RC[23], 20);
+
+    a = op_g(a, b, c, d, data[9], RC[24], 5);
+    d = op_g(d, a, b, c, data[14], RC[25], 9);
+    c = op_g(c, d, a, b, data[3], RC[26], 14);
+    b = op_g(b, c, d, a, data[8], RC[27], 20);
+
+    a = op_g(a, b, c, d, data[13], RC[28], 5);
+    d = op_g(d, a, b, c, data[2], RC[29], 9);
+    c = op_g(c, d, a, b, data[7], RC[30], 14);
+    b = op_g(b, c, d, a, data[12], RC[31], 20);
+
+    // round 3: op_h, word index starts at 5 and advances by 3 (mod 16)
+    a = op_h(a, b, c, d, data[5], RC[32], 4);
+    d = op_h(d, a, b, c, data[8], RC[33], 11);
+    c = op_h(c, d, a, b, data[11], RC[34], 16);
+    b = op_h(b, c, d, a, data[14], RC[35], 23);
+
+    a = op_h(a, b, c, d, data[1], RC[36], 4);
+    d = op_h(d, a, b, c, data[4], RC[37], 11);
+    c = op_h(c, d, a, b, data[7], RC[38], 16);
+    b = op_h(b, c, d, a, data[10], RC[39], 23);
+
+    a = op_h(a, b, c, d, data[13], RC[40], 4);
+    d = op_h(d, a, b, c, data[0], RC[41], 11);
+    c = op_h(c, d, a, b, data[3], RC[42], 16);
+    b = op_h(b, c, d, a, data[6], RC[43], 23);
+
+    a = op_h(a, b, c, d, data[9], RC[44], 4);
+    d = op_h(d, a, b, c, data[12], RC[45], 11);
+    c = op_h(c, d, a, b, data[15], RC[46], 16);
+    b = op_h(b, c, d, a, data[2], RC[47], 23);
+
+    // round 4: op_i, word index starts at 0 and advances by 7 (mod 16)
+    a = op_i(a, b, c, d, data[0], RC[48], 6);
+    d = op_i(d, a, b, c, data[7], RC[49], 10);
+    c = op_i(c, d, a, b, data[14], RC[50], 15);
+    b = op_i(b, c, d, a, data[5], RC[51], 21);
+
+    a = op_i(a, b, c, d, data[12], RC[52], 6);
+    d = op_i(d, a, b, c, data[3], RC[53], 10);
+    c = op_i(c, d, a, b, data[10], RC[54], 15);
+    b = op_i(b, c, d, a, data[1], RC[55], 21);
+
+    a = op_i(a, b, c, d, data[8], RC[56], 6);
+    d = op_i(d, a, b, c, data[15], RC[57], 10);
+    c = op_i(c, d, a, b, data[6], RC[58], 15);
+    b = op_i(b, c, d, a, data[13], RC[59], 21);
+
+    a = op_i(a, b, c, d, data[4], RC[60], 6);
+    d = op_i(d, a, b, c, data[11], RC[61], 10);
+    c = op_i(c, d, a, b, data[2], RC[62], 15);
+    b = op_i(b, c, d, a, data[9], RC[63], 21);
+    // Feed-forward: add the mixed words back into the incoming state (wrapping on overflow).
+    state[0] = state[0].wrapping_add(a);
+    state[1] = state[1].wrapping_add(b);
+    state[2] = state[2].wrapping_add(c);
+    state[3] = state[3].wrapping_add(d);
+}
+/// Runs [`compress_block`] over `blocks` in order, carrying `state` from one block to the next; no-op when empty.
+#[inline]
+pub fn compress(state: &mut [u32; 4], blocks: &[[u8; 64]]) {
+    for block in blocks {
+        compress_block(state, block)
+    }
+}
diff --git a/md5/src/lib.rs b/md5/src/lib.rs
index 87fe9134f..085275ba9 100644
--- a/md5/src/lib.rs
+++ b/md5/src/lib.rs
@@ -30,14 +30,9 @@
 )]
 #![warn(missing_docs, rust_2018_idioms)]
 
-#[cfg(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64")))]
-extern crate md5_asm as compress;
-
-#[cfg(not(all(feature = "asm", any(target_arch = "x86", target_arch = "x86_64"))))]
-mod compress;
-
 pub use digest::{self, Digest};
 
+mod compress;
 use compress::compress;
 
 use core::{fmt, slice::from_ref};