Rollup merge of rust-lang#37427 - nnethercote:opt-IchHasher, r=michae…

…lwoerister Reduce the number of bytes hashed by IchHasher. IchHasher uses blake2b hashing, which is expensive, so the fewer bytes hashed the better. There are two big ways to reduce the number of bytes hashed. - Filenames in spans account for ~66% of all bytes (for builds with debuginfo). The vast majority of spans have the same filename for the start of the span and the end of the span, so hashing the filename just once in those cases is a big win. - u32 and u64 and usize values account for ~25%--33% of all bytes (for builds with debuginfo). The vast majority of these are small, i.e. fit in a u8, so shrinking them down before hashing is also a big win. This PR implements these two optimizations. I'm certain the first one is safe. I'm about 90% sure that the second one is safe. Here are measurements of the number of bytes hashed when doing debuginfo-enabled builds of stdlib and rustc-benchmarks/syntex-0.42.2-incr-clean. ``` stdlib syntex-incr ------ ----------- original 156,781,386 255,095,596 half-SawSpan 106,744,403 176,345,419 short-ints 45,890,534 118,014,227 no-SawSpan[*] 6,831,874 45,875,714 [*] don't hash the SawSpan at all. Not part of this PR, just implemented for comparison's sake. ``` For debug builds of syntex-0.42.2-incr-clean, the two changes give a 1--2% speed-up.
sophiajt · Nov 4, 2016 · e5ebb60 · e5ebb60
2 parents a267214 + d73c68c
commit e5ebb60
Show file tree

Hide file tree

Showing 2 changed files with 68 additions and 12 deletions.
diff --git a/src/librustc_incremental/calculate_svh/hasher.rs b/src/librustc_incremental/calculate_svh/hasher.rs
@@ -9,13 +9,16 @@
 // except according to those terms.
 
 use std::mem;
+use std::hash::Hasher;
 use rustc_data_structures::blake2b::Blake2bHasher;
 use rustc::ty::util::ArchIndependentHasher;
 use ich::Fingerprint;
+use rustc_serialize::leb128::write_unsigned_leb128;
 
 #[derive(Debug)]
 pub struct IchHasher {
     state: ArchIndependentHasher<Blake2bHasher>,
+    leb128_helper: Vec<u8>,
     bytes_hashed: u64,
 }
 
@@ -24,6 +27,7 @@ impl IchHasher {
         let hash_size = mem::size_of::<Fingerprint>();
         IchHasher {
             state: ArchIndependentHasher::new(Blake2bHasher::new(hash_size, &[])),
+            leb128_helper: vec![],
             bytes_hashed: 0
         }
     }
@@ -37,9 +41,19 @@ impl IchHasher {
         fingerprint.0.copy_from_slice(self.state.into_inner().finalize());
         fingerprint
     }
+
+    #[inline]
+    fn write_uleb128(&mut self, value: u64) {
+        let len = write_unsigned_leb128(&mut self.leb128_helper, 0, value);
+        self.state.write(&self.leb128_helper[0..len]);
+        self.bytes_hashed += len as u64;
+    }
 }
 
-impl ::std::hash::Hasher for IchHasher {
+// For the non-u8 integer cases we leb128 encode them first. Because small
+// integers dominate, this significantly and cheaply reduces the number of
+// bytes hashed, which is good because blake2b is expensive.
+impl Hasher for IchHasher {
     fn finish(&self) -> u64 {
         bug!("Use other finish() implementation to get the full 128-bit hash.");
     }
@@ -49,4 +63,26 @@ impl ::std::hash::Hasher for IchHasher {
         self.state.write(bytes);
         self.bytes_hashed += bytes.len() as u64;
     }
+
+    // There is no need to leb128-encode u8 values.
+
+    #[inline]
+    fn write_u16(&mut self, i: u16) {
+        self.write_uleb128(i as u64);
+    }
+
+    #[inline]
+    fn write_u32(&mut self, i: u32) {
+        self.write_uleb128(i as u64);
+    }
+
+    #[inline]
+    fn write_u64(&mut self, i: u64) {
+        self.write_uleb128(i);
+    }
+
+    #[inline]
+    fn write_usize(&mut self, i: usize) {
+        self.write_uleb128(i as u64);
+    }
 }
diff --git a/src/librustc_incremental/calculate_svh/svh_visitor.rs b/src/librustc_incremental/calculate_svh/svh_visitor.rs
@@ -88,6 +88,8 @@ impl<'a, 'hash, 'tcx> StrictVersionHashVisitor<'a, 'hash, 'tcx> {
     // within the CodeMap.
     // Also note that we are hashing byte offsets for the column, not unicode
     // codepoint offsets. For the purpose of the hash that's sufficient.
+    // Also, hashing filenames is expensive so we avoid doing it twice when the
+    // span starts and ends in the same file, which is almost always the case.
     fn hash_span(&mut self, span: Span) {
         debug!("hash_span: st={:?}", self.st);
 
@@ -103,21 +105,35 @@ impl<'a, 'hash, 'tcx> StrictVersionHashVisitor<'a, 'hash, 'tcx> {
             span.hi
         };
 
-        let loc1 = self.codemap.byte_pos_to_line_and_col(span.lo);
-        let loc2 = self.codemap.byte_pos_to_line_and_col(span_hi);
-
-        let expansion_kind = match span.expn_id {
+        let expn_kind = match span.expn_id {
             NO_EXPANSION => SawSpanExpnKind::NoExpansion,
             COMMAND_LINE_EXPN => SawSpanExpnKind::CommandLine,
             _ => SawSpanExpnKind::SomeExpansion,
         };
 
-        SawSpan(loc1.as_ref().map(|&(ref fm, line, col)| (&fm.name[..], line, col)),
-                loc2.as_ref().map(|&(ref fm, line, col)| (&fm.name[..], line, col)),
-                expansion_kind)
-            .hash(self.st);
+        let loc1 = self.codemap.byte_pos_to_line_and_col(span.lo);
+        let loc1 = loc1.as_ref()
+                       .map(|&(ref fm, line, col)| (&fm.name[..], line, col))
+                       .unwrap_or(("???", 0, BytePos(0)));
+
+        let loc2 = self.codemap.byte_pos_to_line_and_col(span_hi);
+        let loc2 = loc2.as_ref()
+                       .map(|&(ref fm, line, col)| (&fm.name[..], line, col))
+                       .unwrap_or(("???", 0, BytePos(0)));
+
+        let saw = if loc1.0 == loc2.0 {
+            SawSpan(loc1.0,
+                    loc1.1, loc1.2,
+                    loc2.1, loc2.2,
+                    expn_kind)
+        } else {
+            SawSpanTwoFiles(loc1.0, loc1.1, loc1.2,
+                            loc2.0, loc2.1, loc2.2,
+                            expn_kind)
+        };
+        saw.hash(self.st);
 
-        if expansion_kind == SawSpanExpnKind::SomeExpansion {
+        if expn_kind == SawSpanExpnKind::SomeExpansion {
             let call_site = self.codemap.codemap().source_callsite(span);
             self.hash_span(call_site);
         }
@@ -189,9 +205,13 @@ enum SawAbiComponent<'a> {
     SawAssocTypeBinding,
     SawAttribute(ast::AttrStyle),
     SawMacroDef,
-    SawSpan(Option<(&'a str, usize, BytePos)>,
-            Option<(&'a str, usize, BytePos)>,
+    SawSpan(&'a str,
+            usize, BytePos,
+            usize, BytePos,
             SawSpanExpnKind),
+    SawSpanTwoFiles(&'a str, usize, BytePos,
+                    &'a str, usize, BytePos,
+                    SawSpanExpnKind),
 }
 
 /// SawExprComponent carries all of the information that we want