Skip to content

Commit 86b9550

Browse files
committed
rustdoc-search: tighter encoding for f index
Two optimizations for the function signature search:

* Instead of using JSON arrays, like `[1,20]`, it uses VLQ hex with no commas, like `[aAd]`.
* This also adds backrefs: if more than one function has exactly the same signature, that signature is not only stored once, it is also *decoded* once and stored in the typeIdMap only once.

Size change
-----------

Standard library:

```console
$ du -bs search-index-old.js search-index-new.js
4976370 search-index-old.js
4404391 search-index-new.js
```

((4976370-4404391)/4404391)*100% ≈ 13.0%

Benchmarks are similarly shrunk:

```console
$ du -bs tmp/{arti,cortex-m,sqlx,stm32f4,ripgrep}/toolchain_{old,new}/doc/search-index.js
10555067 tmp/arti/toolchain_old/doc/search-index.js
8921236 tmp/arti/toolchain_new/doc/search-index.js
77018 tmp/cortex-m/toolchain_old/doc/search-index.js
66676 tmp/cortex-m/toolchain_new/doc/search-index.js
2876330 tmp/sqlx/toolchain_old/doc/search-index.js
2436812 tmp/sqlx/toolchain_new/doc/search-index.js
63632890 tmp/stm32f4/toolchain_old/doc/search-index.js
52337438 tmp/stm32f4/toolchain_new/doc/search-index.js
631150 tmp/ripgrep/toolchain_old/doc/search-index.js
541646 tmp/ripgrep/toolchain_new/doc/search-index.js
```
1 parent 1ab60f2 commit 86b9550

File tree

3 files changed

+170
-86
lines changed

3 files changed

+170
-86
lines changed

src/librustdoc/html/render/mod.rs

+97-49
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ use rustc_span::{
5858
symbol::{sym, Symbol},
5959
BytePos, FileName, RealFileName,
6060
};
61-
use serde::ser::{SerializeMap, SerializeSeq};
61+
use serde::ser::SerializeMap;
6262
use serde::{Serialize, Serializer};
6363

6464
use crate::clean::{self, ItemId, RenderedLink, SelfTy};
@@ -123,115 +123,163 @@ pub(crate) struct IndexItem {
123123
}
124124

125125
/// A type used for the search index.
126-
#[derive(Debug)]
126+
#[derive(Debug, Eq, PartialEq)]
127127
pub(crate) struct RenderType {
128128
/// Identifier of this type. `None` is written to the search index as the
/// "`" sentinel, which the decoder reads as zero.
id: Option<RenderTypeId>,
129129
/// Nested types. When this or `bindings` is `Some`, the type is written as
/// a braced list instead of a bare identifier.
generics: Option<Vec<RenderType>>,
130130
/// Pairs of an identifier and the list of types attached to it. They are
/// written after the generics, and only when this is `Some`.
bindings: Option<Vec<(RenderTypeId, Vec<RenderType>)>>,
131131
}
132132

133-
impl Serialize for RenderType {
134-
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
135-
where
136-
S: Serializer,
137-
{
138-
let id = match &self.id {
139-
// 0 is a sentinel, everything else is one-indexed
140-
None => 0,
141-
// concrete type
142-
Some(RenderTypeId::Index(idx)) if *idx >= 0 => idx + 1,
143-
// generic type parameter
144-
Some(RenderTypeId::Index(idx)) => *idx,
145-
_ => panic!("must convert render types to indexes before serializing"),
146-
};
133+
impl RenderType {
134+
pub fn write_to_string(&self, string: &mut String) {
147135
if self.generics.is_some() || self.bindings.is_some() {
148-
let mut seq = serializer.serialize_seq(None)?;
149-
seq.serialize_element(&id)?;
150-
seq.serialize_element(self.generics.as_ref().map(Vec::as_slice).unwrap_or_default())?;
136+
string.push('{');
137+
// 0 is a sentinel, everything else is one-indexed
138+
match self.id {
139+
Some(id) => id.write_to_string(string),
140+
None => string.push('`'),
141+
}
142+
string.push('{');
143+
for generic in &self.generics.as_ref().map(Vec::as_slice).unwrap_or_default()[..] {
144+
generic.write_to_string(string);
145+
}
146+
string.push('}');
151147
if self.bindings.is_some() {
152-
seq.serialize_element(
153-
self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default(),
154-
)?;
148+
string.push('{');
149+
for binding in &self.bindings.as_ref().map(Vec::as_slice).unwrap_or_default()[..] {
150+
string.push('{');
151+
binding.0.write_to_string(string);
152+
string.push('{');
153+
for constraint in &binding.1[..] {
154+
constraint.write_to_string(string);
155+
}
156+
string.push('}');
157+
string.push('}');
158+
}
159+
string.push('}');
155160
}
156-
seq.end()
161+
string.push('}');
157162
} else {
158-
id.serialize(serializer)
163+
// 0 is a sentinel, everything else is one-indexed
164+
match self.id {
165+
Some(id) => id.write_to_string(string),
166+
None => string.push('`'),
167+
}
159168
}
160169
}
161170
}
162171

163-
#[derive(Clone, Copy, Debug)]
172+
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
164173
/// Identifier of a type in the search index.
///
/// Only the `Index` variant can be written to the index: serializing any
/// other variant panics with "must convert render types to indexes before
/// serializing".
pub(crate) enum RenderTypeId {
165174
DefId(DefId),
166175
Primitive(clean::PrimitiveType),
167176
AssociatedType(Symbol),
168177
/// A non-negative value is a concrete type and is serialized one-indexed
/// (zero is reserved as a sentinel); a negative value is a generic type
/// parameter.
Index(isize),
169178
}
170179

171-
impl Serialize for RenderTypeId {
172-
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
173-
where
174-
S: Serializer,
175-
{
176-
let id = match &self {
180+
impl RenderTypeId {
181+
pub fn write_to_string(&self, string: &mut String) {
182+
// (sign, value)
183+
let (sign, id): (bool, u32) = match &self {
177184
// 0 is a sentinel, everything else is one-indexed
178185
// concrete type
179-
RenderTypeId::Index(idx) if *idx >= 0 => idx + 1,
186+
RenderTypeId::Index(idx) if *idx >= 0 => (false, (idx + 1isize).try_into().unwrap()),
180187
// generic type parameter
181-
RenderTypeId::Index(idx) => *idx,
188+
RenderTypeId::Index(idx) => (true, (-*idx).try_into().unwrap()),
182189
_ => panic!("must convert render types to indexes before serializing"),
183190
};
184-
id.serialize(serializer)
191+
// zig-zag notation
192+
let value: u32 = (id << 1) | (if sign { 1 } else { 0 });
193+
// encode
194+
let mut shift: u32 = 28;
195+
let mut mask: u32 = 0xF0_00_00_00;
196+
while shift < 32 {
197+
let hexit = (value & mask) >> shift;
198+
if hexit != 0 || shift == 0 {
199+
let hex =
200+
char::try_from(if shift == 0 { '`' } else { '@' } as u32 + hexit).unwrap();
201+
string.push(hex);
202+
}
203+
shift = shift.wrapping_sub(4);
204+
mask = mask >> 4;
205+
}
185206
}
186207
}
187208

188209
/// Full type of functions/methods in the search index.
189-
#[derive(Debug)]
210+
#[derive(Debug, Eq, PartialEq)]
190211
pub(crate) struct IndexItemFunctionType {
191212
/// Types of the inputs; written first when the function is serialized.
inputs: Vec<RenderType>,
192213
/// Types of the output; skipped when serializing if both this and
/// `where_clause` are empty.
output: Vec<RenderType>,
193214
/// Lists of types written after the output, one list per entry.
/// NOTE(review): presumably one entry per constrained generic parameter —
/// confirm against the code that builds this struct.
where_clause: Vec<Vec<RenderType>>,
194215
}
195216

196-
impl Serialize for IndexItemFunctionType {
197-
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
198-
where
199-
S: Serializer,
200-
{
217+
impl IndexItemFunctionType {
218+
pub fn write_to_string<'a>(
219+
&'a self,
220+
string: &mut String,
221+
backref_queue: &mut VecDeque<&'a IndexItemFunctionType>,
222+
) {
223+
assert!(backref_queue.len() < 16);
201224
// If we couldn't figure out a type, just write `0`.
202225
let has_missing = self
203226
.inputs
204227
.iter()
205228
.chain(self.output.iter())
206229
.any(|i| i.id.is_none() && i.generics.is_none());
207230
if has_missing {
208-
0.serialize(serializer)
231+
string.push('`');
232+
} else if let Some(idx) = backref_queue.iter().position(|other| *other == self) {
233+
string.push(
234+
char::try_from('0' as u32 + u32::try_from(idx).unwrap())
235+
.expect("last possible value is '?'"),
236+
);
209237
} else {
210-
let mut seq = serializer.serialize_seq(None)?;
238+
backref_queue.push_front(self);
239+
if backref_queue.len() >= 16 {
240+
backref_queue.pop_back();
241+
}
242+
string.push('{');
211243
match &self.inputs[..] {
212244
[one] if one.generics.is_none() && one.bindings.is_none() => {
213-
seq.serialize_element(one)?
245+
one.write_to_string(string);
246+
}
247+
_ => {
248+
string.push('{');
249+
for item in &self.inputs[..] {
250+
item.write_to_string(string);
251+
}
252+
string.push('}');
214253
}
215-
_ => seq.serialize_element(&self.inputs)?,
216254
}
217255
match &self.output[..] {
218256
[] if self.where_clause.is_empty() => {}
219257
[one] if one.generics.is_none() && one.bindings.is_none() => {
220-
seq.serialize_element(one)?
258+
one.write_to_string(string);
259+
}
260+
_ => {
261+
string.push('{');
262+
for item in &self.output[..] {
263+
item.write_to_string(string);
264+
}
265+
string.push('}');
221266
}
222-
_ => seq.serialize_element(&self.output)?,
223267
}
224268
for constraint in &self.where_clause {
225269
if let [one] = &constraint[..]
226270
&& one.generics.is_none()
227271
&& one.bindings.is_none()
228272
{
229-
seq.serialize_element(one)?;
273+
one.write_to_string(string);
230274
} else {
231-
seq.serialize_element(constraint)?;
275+
string.push('{');
276+
for item in &constraint[..] {
277+
item.write_to_string(string);
278+
}
279+
string.push('}');
232280
}
233281
}
234-
seq.end()
282+
string.push('}');
235283
}
236284
}
237285
}

src/librustdoc/html/render/search_index.rs

+7-22
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
use std::collections::hash_map::Entry;
2-
use std::collections::BTreeMap;
2+
use std::collections::{BTreeMap, VecDeque};
33

44
use rustc_data_structures::fx::{FxHashMap, FxIndexMap};
55
use rustc_middle::ty::TyCtxt;
@@ -409,9 +409,11 @@ pub(crate) fn build_index<'tcx>(
409409
let mut full_paths = Vec::with_capacity(self.items.len());
410410
let mut descriptions = Vec::with_capacity(self.items.len());
411411
let mut parents = Vec::with_capacity(self.items.len());
412-
let mut functions = Vec::with_capacity(self.items.len());
412+
let mut functions = String::with_capacity(self.items.len());
413413
let mut deprecated = Vec::with_capacity(self.items.len());
414414

415+
let mut backref_queue = VecDeque::new();
416+
415417
for (index, item) in self.items.iter().enumerate() {
416418
let n = item.ty as u8;
417419
let c = char::try_from(n + b'A').expect("item types must fit in ASCII");
@@ -434,27 +436,10 @@ pub(crate) fn build_index<'tcx>(
434436
full_paths.push((index, &item.path));
435437
}
436438

437-
// Fake option to get `0` out as a sentinel instead of `null`.
438-
// We want to use `0` because it's three less bytes.
439-
enum FunctionOption<'a> {
440-
Function(&'a IndexItemFunctionType),
441-
None,
442-
}
443-
impl<'a> Serialize for FunctionOption<'a> {
444-
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
445-
where
446-
S: Serializer,
447-
{
448-
match self {
449-
FunctionOption::None => 0.serialize(serializer),
450-
FunctionOption::Function(ty) => ty.serialize(serializer),
451-
}
452-
}
439+
match &item.search_type {
440+
Some(ty) => ty.write_to_string(&mut functions, &mut backref_queue),
441+
None => functions.push('`'),
453442
}
454-
functions.push(match &item.search_type {
455-
Some(ty) => FunctionOption::Function(ty),
456-
None => FunctionOption::None,
457-
});
458443

459444
if item.deprecation.is_some() {
460445
deprecated.push(index);

src/librustdoc/html/static/js/search.js

+66-15
Original file line numberDiff line numberDiff line change
@@ -2767,19 +2767,65 @@ ${item.displayPath}<span class="${type}">${name}</span>\
27672767
* The raw function search type format is generated by
27682768
* librustdoc/html/render/mod.rs: IndexItemFunctionType::write_to_string
27692769
*
2770-
* @param {RawFunctionSearchType} functionSearchType
2770+
* @param {{
2771+
* string: string,
2772+
* offset: number,
2773+
* backrefQueue: FunctionSearchType[]
2774+
* }} itemFunctionDecoder
27712775
* @param {Array<{name: string, ty: number}>} lowercasePaths
27722776
* @param {Map<string, integer>}
27732777
*
27742778
* @return {null|FunctionSearchType}
27752779
*/
2776-
function buildFunctionSearchType(functionSearchType, lowercasePaths) {
2777-
const INPUTS_DATA = 0;
2778-
const OUTPUT_DATA = 1;
2779-
// `0` is used as a sentinel because it's fewer bytes than `null`
2780-
if (functionSearchType === 0) {
2780+
function buildFunctionSearchType(itemFunctionDecoder, lowercasePaths) {
2781+
const c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2782+
itemFunctionDecoder.offset += 1;
2783+
const [zero, ua, la, ob, cb] = ["0", "@", "`", "{", "}"].map(c => c.charCodeAt(0));
2784+
// `` ` `` is used as a sentinel because it's fewer bytes than `null`, and decodes to zero
2785+
// `0` is a backref
2786+
if (c === la) {
27812787
return null;
27822788
}
2789+
// sixteen characters after "0" are backref
2790+
if (c >= zero && c < ua) {
2791+
return itemFunctionDecoder.backrefQueue[c - zero];
2792+
}
2793+
if (c !== ob) {
2794+
throw ["Unexpected ", c, " in function: expected ", "{", "; this is a bug"];
2795+
}
2796+
// call after consuming `{`
2797+
function decodeList() {
2798+
let c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2799+
const ret = [];
2800+
while (c !== cb) {
2801+
ret.push(decode());
2802+
c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2803+
}
2804+
itemFunctionDecoder.offset += 1; // eat cb
2805+
return ret;
2806+
}
2807+
// consumes and returns a list or integer
2808+
function decode() {
2809+
let n = 0;
2810+
let c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2811+
if (c === ob) {
2812+
itemFunctionDecoder.offset += 1;
2813+
return decodeList();
2814+
}
2815+
while (c < la) {
2816+
n = (n << 4) | (c & 0xF);
2817+
itemFunctionDecoder.offset += 1;
2818+
c = itemFunctionDecoder.string.charCodeAt(itemFunctionDecoder.offset);
2819+
}
2820+
// last character >= la
2821+
n = (n << 4) | (c & 0xF);
2822+
const [sign, value] = [n & 1, n >> 1];
2823+
itemFunctionDecoder.offset += 1;
2824+
return sign ? -value : value;
2825+
}
2826+
const functionSearchType = decodeList();
2827+
const INPUTS_DATA = 0;
2828+
const OUTPUT_DATA = 1;
27832829
let inputs, output;
27842830
if (typeof functionSearchType[INPUTS_DATA] === "number") {
27852831
inputs = [buildItemSearchType(functionSearchType[INPUTS_DATA], lowercasePaths)];
@@ -2808,9 +2854,14 @@ ${item.displayPath}<span class="${type}">${name}</span>\
28082854
? [buildItemSearchType(functionSearchType[i], lowercasePaths)]
28092855
: buildItemSearchTypeAll(functionSearchType[i], lowercasePaths));
28102856
}
2811-
return {
2857+
const ret = {
28122858
inputs, output, where_clause,
28132859
};
2860+
itemFunctionDecoder.backrefQueue.unshift(ret);
2861+
if (itemFunctionDecoder.backrefQueue.length >= 16) {
2862+
itemFunctionDecoder.backrefQueue.pop();
2863+
}
2864+
return ret;
28142865
}
28152866

28162867
/**
@@ -2992,8 +3043,12 @@ ${item.displayPath}<span class="${type}">${name}</span>\
29923043
const itemDescs = crateCorpus.d;
29933044
// an array of (Number) the parent path index + 1 to `paths`, or 0 if none
29943045
const itemParentIdxs = crateCorpus.i;
2995-
// an array of (Array | 0) the type of the function, if any
2996-
const itemFunctionSearchTypes = crateCorpus.f;
3046+
// a string representing the list of function types
3047+
const itemFunctionDecoder = {
3048+
string: crateCorpus.f,
3049+
offset: 0,
3050+
backrefQueue: [],
3051+
};
29973052
// an array of (Number) indices for the deprecated items
29983053
const deprecatedItems = new Set(crateCorpus.c);
29993054
// an array of (Number) indices for the deprecated items
@@ -3041,12 +3096,8 @@ ${item.displayPath}<span class="${type}">${name}</span>\
30413096
word = itemNames[i].toLowerCase();
30423097
}
30433098
const path = itemPaths.has(i) ? itemPaths.get(i) : lastPath;
3044-
let type = null;
3045-
if (itemFunctionSearchTypes[i] !== 0) {
3046-
type = buildFunctionSearchType(
3047-
itemFunctionSearchTypes[i],
3048-
lowercasePaths
3049-
);
3099+
const type = buildFunctionSearchType(itemFunctionDecoder, lowercasePaths);
3100+
if (type !== null) {
30503101
if (type) {
30513102
const fp = functionTypeFingerprint.subarray(id * 4, (id + 1) * 4);
30523103
const fps = new Set();

0 commit comments

Comments
 (0)