Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Cache the JSON documentation of path dependencies #340

Merged
merged 2 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions libs/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions libs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,15 @@ bimap = "0.6.2"
bincode = "1"
biscotti = "0.3"
bytes = "1.5.0"
camino = "1"
cargo-like-utils = "0.1.2"
cargo-manifest = "0.14.0"
clap = "4"
clap-stdin = "0.4.0"
config = "0.14.0"
console = "0.15.1"
convert_case = "0.6"
xxhash-rust = "0.8.12"
elsa = "1.4.0"
fixedbitset = "0.4"
fs-err = "2.11.0"
Expand Down
13 changes: 9 additions & 4 deletions libs/deny.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,15 @@ all-features = true
no-default-features = false

[advisories]
# The `yaml-rust` crate is unmaintained, but we depend on it transitively via `config`,
# so no easy way to remove it. Since it isn't vulnerable, we'll ignore the advisory
# for now.
ignore = ["RUSTSEC-2024-0320"]
ignore = [
# The `yaml-rust` crate is unmaintained, but we depend on it transitively via `config`,
# so no easy way to remove it. Since it isn't vulnerable, we'll ignore the advisory
# for now.
"RUSTSEC-2024-0320",
# `proc-macro-error` is unmaintained, but we depend on it transitively via `vergen-lib`.
# There is no runtime risk, so we'll ignore the advisory for now.
"RUSTSEC-2024-0370",
]

[licenses]
# List of explicitly allowed licenses
Expand Down
2 changes: 2 additions & 0 deletions libs/pavexc/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ semver = { workspace = true }
persist_if_changed = { path = "../persist_if_changed", version = "0.1.48" }
matchit = { version = "0.7", package = "pavex_matchit" }
relative-path = { workspace = true }
camino = { workspace = true }
xxhash-rust = { workspace = true, features = ["xxh64"] }

# Sqlite cache
xdg-home = { workspace = true }
Expand Down
225 changes: 144 additions & 81 deletions libs/pavexc/src/rustdoc/compute/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ use tracing::instrument;

use crate::rustdoc::queries::{CrateData, CrateItemIndex, LazyCrateItemIndex};

use super::rustdoc_options;
use super::{checksum::checksum_crate, rustdoc_options};

/// A cache for storing and retrieving pre-computed JSON documentation generated by `rustdoc`.
///
Expand Down Expand Up @@ -344,95 +344,124 @@ impl ThirdPartyCrateCache {
}

/// Retrieve the cached documentation for a given package, if available.
#[instrument(name = "Retrieve cached toolchain docs from disk",
#[instrument(name = "Retrieve third-party crate docs from disk cache",
skip_all,
level=tracing::Level::DEBUG,
fields(crate.name = %package_metadata.name())
fields(crate.id = %package_metadata.id(), cache_key = tracing::field::Empty, hit = tracing::field::Empty)
)]
fn get(
&self,
package_metadata: &PackageMetadata,
cargo_fingerprint: &str,
connection: &rusqlite::Connection,
) -> Result<Option<crate::rustdoc::Crate>, anyhow::Error> {
let Some(cache_key) = ThirdPartyCrateCacheKey::build(package_metadata, cargo_fingerprint)
else {
return Ok(None);
};
// Retrieve rustdoc's output from the cache, if available.
let mut stmt = connection.prepare_cached(
"SELECT
root_item_id,
external_crates,
paths,
format_version,
items,
item_id2delimiters,
id2public_import_paths,
id2private_import_paths,
import_path2id,
re_exports
FROM rustdoc_3d_party_crates_cache
WHERE crate_name = ? AND
crate_source = ? AND
crate_version = ? AND
cargo_fingerprint = ? AND
rustdoc_options = ? AND
default_feature_is_enabled = ? AND
active_named_features = ?",
)?;
let span = tracing::trace_span!("Execute query");
let guard = span.enter();
let mut rows = stmt.query(params![
cache_key.crate_name,
cache_key.crate_source,
cache_key.crate_version,
cache_key.cargo_fingerprint,
cache_key.rustdoc_options,
cache_key.default_feature_is_enabled,
cache_key.active_named_features
])?;
let Some(row) = rows.next()? else {
return Ok(None);
};
drop(guard);

let root_item_id = row.get_ref_unwrap(0).as_str()?;
let external_crates = row.get_ref_unwrap(1).as_bytes()?;
let paths = row.get_ref_unwrap(2).as_bytes()?;
let format_version = row.get_ref_unwrap(3).as_i64()?;

let span = tracing::trace_span!("Copy items bytes buffer");
let guard = span.enter();
let items: Vec<u8> = row.get_unwrap(4);
drop(guard);

let item_id2delimiters = row.get_ref_unwrap(5).as_bytes()?;
let id2public_import_paths = row.get_ref_unwrap(6).as_bytes()?;
let id2private_import_paths = row.get_ref_unwrap(7).as_bytes()?;
let import_path2id = row.get_ref_unwrap(8).as_bytes()?;
let re_exports = row.get_ref_unwrap(9).as_bytes()?;
fn _get(
package_metadata: &PackageMetadata,
cargo_fingerprint: &str,
connection: &rusqlite::Connection,
) -> Result<Option<crate::rustdoc::Crate>, anyhow::Error> {
let Some(cache_key) =
ThirdPartyCrateCacheKey::build(package_metadata, cargo_fingerprint)
else {
return Ok(None);
};
tracing::Span::current().record("cache_key", tracing::field::debug(&cache_key));
// Retrieve rustdoc's output from the cache, if available.
let mut stmt = connection.prepare_cached(
"SELECT
root_item_id,
external_crates,
paths,
format_version,
items,
item_id2delimiters,
id2public_import_paths,
id2private_import_paths,
import_path2id,
re_exports
FROM rustdoc_3d_party_crates_cache
WHERE crate_name = ? AND
crate_source = ? AND
crate_version = ? AND
crate_hash = ? AND
cargo_fingerprint = ? AND
rustdoc_options = ? AND
default_feature_is_enabled = ? AND
active_named_features = ?",
)?;
let span = tracing::trace_span!("Execute query");
let guard = span.enter();
let mut rows = stmt.query(params![
cache_key.crate_name,
cache_key.crate_source,
cache_key.crate_version,
// `NULL` values are considered to be distinct from all other values
// by SQLite, including other `NULL`s. Therefore we use an empty
// string as a placeholder for `NULL` values.
cache_key.crate_hash.unwrap_or_default(),
cache_key.cargo_fingerprint,
cache_key.rustdoc_options,
cache_key.default_feature_is_enabled,
cache_key.active_named_features
])?;
let Some(row) = rows.next()? else {
return Ok(None);
};
drop(guard);

let root_item_id = row.get_ref_unwrap(0).as_str()?;
let external_crates = row.get_ref_unwrap(1).as_bytes()?;
let paths = row.get_ref_unwrap(2).as_bytes()?;
let format_version = row.get_ref_unwrap(3).as_i64()?;

let span = tracing::trace_span!("Copy items bytes buffer");
let guard = span.enter();
let items: Vec<u8> = row.get_unwrap(4);
drop(guard);

let item_id2delimiters = row.get_ref_unwrap(5).as_bytes()?;
let id2public_import_paths = row.get_ref_unwrap(6).as_bytes()?;
let id2private_import_paths = row.get_ref_unwrap(7).as_bytes()?;
let import_path2id = row.get_ref_unwrap(8).as_bytes()?;
let re_exports = row.get_ref_unwrap(9).as_bytes()?;

let krate = CachedData {
root_item_id,
external_crates: Cow::Borrowed(external_crates),
paths: Cow::Borrowed(paths),
format_version,
items: Cow::Owned(items),
item_id2delimiters: Cow::Borrowed(item_id2delimiters),
id2public_import_paths: Cow::Borrowed(id2public_import_paths),
id2private_import_paths: Cow::Borrowed(id2private_import_paths),
import_path2id: Cow::Borrowed(import_path2id),
re_exports: Cow::Borrowed(re_exports),
}
.hydrate(package_metadata.id().to_owned())
.context("Failed to re-hydrate the stored docs")?;

let krate = CachedData {
root_item_id,
external_crates: Cow::Borrowed(external_crates),
paths: Cow::Borrowed(paths),
format_version,
items: Cow::Owned(items),
item_id2delimiters: Cow::Borrowed(item_id2delimiters),
id2public_import_paths: Cow::Borrowed(id2public_import_paths),
id2private_import_paths: Cow::Borrowed(id2private_import_paths),
import_path2id: Cow::Borrowed(import_path2id),
re_exports: Cow::Borrowed(re_exports),
Ok(Some(krate))
}
.hydrate(package_metadata.id().to_owned())
.context("Failed to re-hydrate the stored docs")?;

Ok(Some(krate))
let outcome = _get(package_metadata, cargo_fingerprint, connection);
match &outcome {
Ok(Some(_)) => {
tracing::Span::current().record("hit", true);
}
Ok(None) => {
tracing::Span::current().record("hit", false);
}
_ => {}
}
outcome
}

/// Store the JSON documentation generated by `rustdoc` in the cache.
#[instrument(name = "Cache rustdoc output on disk", skip_all, level=tracing::Level::DEBUG, fields(crate.name = %package_metadata.name()))]
#[instrument(
name = "Cache third-party crate docs to disk",
skip_all,
level=tracing::Level::DEBUG,
fields(crate.id = %package_metadata.id(), cache_key = tracing::field::Empty))
]
fn insert(
&self,
package_metadata: &PackageMetadata,
Expand All @@ -444,12 +473,14 @@ impl ThirdPartyCrateCache {
else {
return Ok(());
};
tracing::Span::current().record("cache_key", tracing::field::debug(&cache_key));
let cached_data = CachedData::new(krate).context("Failed to serialize docs")?;
let mut stmt = connection.prepare_cached(
"INSERT INTO rustdoc_3d_party_crates_cache (
crate_name,
crate_source,
crate_version,
crate_hash,
cargo_fingerprint,
rustdoc_options,
default_feature_is_enabled,
Expand All @@ -464,12 +495,16 @@ impl ThirdPartyCrateCache {
id2private_import_paths,
import_path2id,
re_exports
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
)?;
stmt.execute(params![
cache_key.crate_name,
cache_key.crate_source,
cache_key.crate_version,
// `NULL` values are considered to be distinct from all other values
// by SQLite, including other `NULL`s. Therefore we use an empty
// string as a placeholder for `NULL` values.
cache_key.crate_hash.unwrap_or_default(),
cache_key.cargo_fingerprint,
cache_key.rustdoc_options,
cache_key.default_feature_is_enabled,
Expand All @@ -494,6 +529,7 @@ impl ThirdPartyCrateCache {
crate_name TEXT NOT NULL,
crate_source TEXT NOT NULL,
crate_version TEXT NOT NULL,
crate_hash TEXT NOT NULL,
cargo_fingerprint TEXT NOT NULL,
rustdoc_options TEXT NOT NULL,
default_feature_is_enabled INTEGER NOT NULL,
Expand All @@ -508,7 +544,7 @@ impl ThirdPartyCrateCache {
id2private_import_paths BLOB NOT NULL,
import_path2id BLOB NOT NULL,
re_exports BLOB NOT NULL,
PRIMARY KEY (crate_name, crate_source, crate_version, cargo_fingerprint, rustdoc_options, default_feature_is_enabled, active_named_features)
PRIMARY KEY (crate_name, crate_source, crate_version, crate_hash, cargo_fingerprint, rustdoc_options, default_feature_is_enabled, active_named_features)
)",
[]
)?;
Expand Down Expand Up @@ -640,6 +676,9 @@ pub(super) struct ThirdPartyCrateCacheKey<'a> {
pub crate_name: &'a str,
pub crate_source: &'a str,
pub crate_version: String,
/// The hash of the crate's source code, as computed by `checksum_crate`
/// (presumably xxHash64 via `xxhash-rust`, rather than BLAKE3 — TODO confirm).
/// It is only populated for path dependencies.
pub crate_hash: Option<String>,
pub cargo_fingerprint: &'a str,
pub rustdoc_options: String,
pub default_feature_is_enabled: bool,
Expand All @@ -652,10 +691,33 @@ impl<'a> ThirdPartyCrateCacheKey<'a> {
package_metadata: &'a PackageMetadata<'a>,
cargo_fingerprint: &'a str,
) -> Option<ThirdPartyCrateCacheKey<'a>> {
// We don't want to cache the docs for workspace crates and path dependencies.
let Some(source) = package_metadata.source().external_source() else {
return None;
let source = match package_metadata.source() {
guppy::graph::PackageSource::Workspace(_) => {
// We don't want to cache the docs for workspace crates.
return None;
}
guppy::graph::PackageSource::Path(p) => p.as_str(),
guppy::graph::PackageSource::External(e) => e,
};
let crate_hash =
if let guppy::graph::PackageSource::Path(package_path) = package_metadata.source() {
// We need to compute the hash of the package's contents,
// to invalidate the cache when the package changes.
// This is only relevant for path dependencies.
// We don't need to do this for external dependencies,
// since they are assumed to be immutable.
let Ok(hash) = checksum_crate(package_path) else {
tracing::warn!(
"Failed to compute the hash of the package at {:?}.
I won't cache its JSON documentation to avoid serving stale data.",
package_metadata.id()
);
return None;
};
Some(hash.to_string())
} else {
None
};
let features = package_metadata
.to_feature_set(StandardFeatures::Default)
.features_for(package_metadata.id())
Expand All @@ -669,6 +731,7 @@ impl<'a> ThirdPartyCrateCacheKey<'a> {
crate_name: package_metadata.name(),
crate_source: source,
crate_version: package_metadata.version().to_string(),
crate_hash,
cargo_fingerprint,
default_feature_is_enabled,
// SQLite doesn't support arrays, so we have to serialize these two collections as strings.
Expand Down
Loading