Skip to content

Commit

Permalink
Merge pull request #593 from DataDog/jf/K9VULN-2162
Browse files Browse the repository at this point in the history
[K9VULN-2162] Add rule caching to datadog-static-analyzer-server binary
  • Loading branch information
jasonforal authored Jan 7, 2025
2 parents 3416ce2 + 9768ebd commit 3d1e3bc
Show file tree
Hide file tree
Showing 11 changed files with 384 additions and 150 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ codegen-units = 256
[workspace.dependencies]
anyhow = "1"
base64 = "0.21.2"
dashmap = "6.1.0"
git2 = "0.19.0"
indexmap = { version = "2.2", features = ["serde"] }
itertools = "0.12.1"
Expand Down
1 change: 1 addition & 0 deletions LICENSE-3rdparty.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ anyhow,https://crates.io/crates/anyhow,MIT,Copyright (c) 2019 David Tolnay
base64,https://github.com/marshallpierce/rust-base64,Apache-2.0,Copyright (c) 2015 Alice Maz
bstr,https://github.com/BurntSushi/bstr,MIT,Copyright (c) 2018-2019 Andrew Gallant
csv,https://github.com/BurntSushi/rust-csv,MIT,Copyright (c) 2015 Andrew Gallant
dashmap,https://github.com/xacrimon/dashmap,MIT,Copyright (c) 2019 Acrimon
deno-core,https://github.com/denoland/deno,MIT,Copyright 2018-2023 the Deno authors
futures,https://rust-lang.github.io/futures-rs/,MIT,Copyright (c) 2017 The Tokio Authors
git2,https://crates.io/crates/git2,MIT,Copyright (c) 2014 Alex Crichton
Expand Down
1 change: 1 addition & 0 deletions crates/bins/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ thiserror = { workspace = true }
tree-sitter = { workspace = true }

# other
dashmap = "6.1.0"
terminal-emoji = "0.4.1"
getopts = "0.2.21"
indicatif = "0.17.6"
Expand Down
3 changes: 3 additions & 0 deletions crates/bins/src/bin/datadog-static-analyzer-server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,15 @@
// This product includes software developed at Datadog (https://www.datadoghq.com/).
// Copyright 2024 Datadog, Inc.

use crate::datadog_static_analyzer_server::rule_cache::RuleCache;
use kernel::analysis::ddsa_lib::v8_platform::{initialize_v8, Initialized, V8Platform};
use std::sync::OnceLock;

mod datadog_static_analyzer_server;

pub(crate) static V8_PLATFORM: OnceLock<V8Platform<Initialized>> = OnceLock::new();
pub(crate) static RAYON_POOL: OnceLock<rayon::ThreadPool> = OnceLock::new();
pub(crate) static RULE_CACHE: OnceLock<RuleCache> = OnceLock::new();

#[rocket::main]
async fn main() {
Expand Down
8 changes: 8 additions & 0 deletions crates/bins/src/bin/datadog_static_analyzer_server/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ use tracing_subscriber::EnvFilter;
use crate::datadog_static_analyzer_server::error_codes::{
ERROR_CHANNEL_DISCONNECTED, ERROR_SHUTDOWN_FAILURE,
};
use crate::datadog_static_analyzer_server::rule_cache::RuleCache;
use crate::RULE_CACHE;

use super::state::ServerState;
use super::utils::get_current_timestamp_ms;
Expand Down Expand Up @@ -42,6 +44,7 @@ fn get_opts() -> Options {
opts.optflag("e", "enable-shutdown", "Enables the shutdown endpoint");
opts.optflag("h", "help", "Print this help");
opts.optflag("v", "version", "Shows the tool version");
opts.optflag("c", "use-rules-cache", "Pre-compile and cache rules");

opts.optopt(
"l",
Expand Down Expand Up @@ -171,6 +174,11 @@ pub fn prepare_rocket(tx_keep_alive_error: Sender<i32>) -> Result<RocketPreparat
// server state
tracing::debug!("Preparing the server state and rocket configuration");
let mut server_state = ServerState::new(matches.opt_str("s"), matches.opt_present("e"));
// Create a cache for rules, if requested
if matches.opt_present("use-rules-cache") {
RULE_CACHE.set(RuleCache::new()).expect("should init once");
}

let mut rocket_configuration = rocket::config::Config {
// disable rocket colors and emojis if we're logging to a file as we will be using json format
cli_colors: !matches.opt_present("l"),
Expand Down
15 changes: 9 additions & 6 deletions crates/bins/src/bin/datadog_static_analyzer_server/endpoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@ use std::cell::Cell;
use std::path::Path;

use crate::datadog_static_analyzer_server::fairings::TraceSpan;
use crate::{RAYON_POOL, V8_PLATFORM};
use crate::datadog_static_analyzer_server::rule_cache::cached_analysis_request;
use crate::{RAYON_POOL, RULE_CACHE, V8_PLATFORM};
use kernel::analysis::ddsa_lib::JsRuntime;
use rocket::{
fs::NamedFile,
Expand All @@ -11,11 +12,11 @@ use rocket::{
serde::json::{json, Json, Value},
Build, Rocket, Shutdown, State,
};
use server::model::analysis_request::ServerRule;
use server::model::analysis_response::AnalysisResponse;
use server::model::{
analysis_request::AnalysisRequest, tree_sitter_tree_request::TreeSitterRequest,
};
use server::request::process_analysis_request;
use server::tree_sitter_tree::process_tree_sitter_tree_request;
use thiserror::Error;

Expand Down Expand Up @@ -110,7 +111,7 @@ fn languages(span: TraceSpan) -> Value {
}

#[rocket::post("/analyze", format = "application/json", data = "<request>")]
async fn analyze(span: TraceSpan, request: Json<AnalysisRequest>) -> Value {
async fn analyze(span: TraceSpan, request: Json<AnalysisRequest<ServerRule>>) -> Value {
let _entered = span.enter();

rocket::tokio::task::spawn_blocking(|| {
Expand All @@ -125,11 +126,13 @@ async fn analyze(span: TraceSpan, request: Json<AnalysisRequest>) -> Value {
let v8 = V8_PLATFORM.get().expect("v8 should have been initialized");
v8.try_new_runtime().expect("ddsa init should succeed")
});
let request = request.into_inner();
let (rule_responses, errors) =
match process_analysis_request(request.into_inner(), runtime_ref) {
Ok(rule_responses) => (rule_responses, vec![]),
Err(err) => (vec![], vec![err]),
match cached_analysis_request(runtime_ref, request, RULE_CACHE.get()) {
Ok(resp) => (resp, vec![]),
Err(err) => (vec![], vec![err.to_string()]),
};

JS_RUNTIME.replace(opt);

json!(AnalysisResponse {
Expand Down
1 change: 1 addition & 0 deletions crates/bins/src/bin/datadog_static_analyzer_server/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ mod endpoints;
mod error_codes;
mod fairings;
mod ide;
pub(crate) mod rule_cache;
mod state;
mod utils;

Expand Down
256 changes: 256 additions & 0 deletions crates/bins/src/bin/datadog_static_analyzer_server/rule_cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
use dashmap::DashMap;
use kernel::analysis::ddsa_lib::JsRuntime;
use kernel::model::rule::RuleInternal;
use server::model::analysis_request::{AnalysisRequest, ServerRule};
use server::model::analysis_response::RuleResponse;
use server::request::process_analysis_request;
use std::sync::Arc;

/// A [`ServerRule`] with its corresponding pre-compiled [`RuleInternal`].
#[derive(Debug)]
pub struct CompiledServerRule {
rule: ServerRule,
/// The rule (from converted from the "raw" `ServerRule`), which has a pre-compiled tree-sitter query.
internal: RuleInternal,
}

impl CompiledServerRule {
/// Returns a reference to the [`ServerRule`] that this cache represents.
pub fn inner(&self) -> &ServerRule {
&self.rule
}

/// Returns a reference to the [`RuleInternal`] that this cache represents.
pub fn as_internal(&self) -> &RuleInternal {
&self.internal
}
}

/// A cache of compiled [`RuleInternal`], keyed by the rule name. Previously-cached values are overwritten
/// if there is a rule name collision.
#[derive(Debug)]
pub struct RuleCache(DashMap<String, Arc<CompiledServerRule>>);

/// A return value from looking up or inserting a value into a [`RuleCache`]. `is_update` will be `true`
/// if this value previously existed in the map and was updated, or `false` if either a
/// previous value was returned or the rule name has been cached for the first time.
#[derive(Debug)]
pub(crate) struct CachedValue {
pub(crate) rule: Arc<CompiledServerRule>,
pub(crate) is_update: bool,
}

impl RuleCache {
/// Returns a new `RuleCache` with a capacity of 0.
pub fn new() -> Self {
Self(DashMap::new())
}

/// Returns a [`CachedValue`] for the provided [`ServerRule`], utilizing a cache under the hood.
pub fn get_or_insert_from(&self, server_rule: ServerRule) -> Result<CachedValue, &'static str> {
if let Some(cached) = self.0.get(&server_rule.name) {
// (Note that even if the cache has this rule re-written after this function returns, the caller
// will still hold an owned reference to the original compiled rule, so there is no race).

// Only return the cached value if it's an exact match of the incoming `server_rule`.
// This check is necessary because the HashMap is keyed by rule name, so there
// could be a collision if the rule is updated.
if cached.rule == server_rule {
return Ok(CachedValue {
rule: Arc::clone(cached.value()),
is_update: false,
});
}
}
// If there was no cached rule (or the rule wasn't an exact match), compile and cache a new rule.
let rule_internal = RuleInternal::try_from(server_rule.clone())?;
let rule_name = server_rule.name.clone();
let compiled = Arc::new(CompiledServerRule {
rule: server_rule,
internal: rule_internal,
});
let cloned_rule = Arc::clone(&compiled);
let existing = self.0.insert(rule_name, compiled);
Ok(CachedValue {
rule: cloned_rule,
is_update: existing.is_some(),
})
}
}

/// Executes the [`AnalysisRequest`], optionally utilizing a cache for compiled rules.
pub(crate) fn cached_analysis_request(
runtime: &mut JsRuntime,
request: AnalysisRequest<ServerRule>,
cache: Option<&RuleCache>,
) -> Result<Vec<RuleResponse>, &'static str> {
if let Some(cache) = cache {
// A vec to hold compiled rules as owned values so their contained `RuleInternal` can be passed in as a Vec<&RuleInternal>.
// (This is necessary because `Arc<T>` doesn't impl `Borrow<T>`)
let mut compiled_rules = Vec::with_capacity(request.rules.len());
for server_rule in request.rules {
let cached = cache.get_or_insert_from(server_rule)?;
// (Since we've had a mutable reference to the `JsRuntime` for this whole scoped job,
// we know the runtime's v8::Script cache cannot change out from under us.
// Thus, there is no potential race in the time after clearing the cache
// but before actually executing the JavaScript rule).
if cached.is_update {
// The runtime's `v8::Script` cache is keyed only by name -- we need to manually clear it.
runtime.clear_rule_cache(&cached.rule.inner().name);
}
compiled_rules.push(cached.rule);
}
let rule_internals = compiled_rules
.iter()
.map(|rule| rule.as_internal())
.collect::<Vec<_>>();
let req_with_compiled: AnalysisRequest<&RuleInternal> = AnalysisRequest {
filename: request.filename,
language: request.language,
file_encoding: request.file_encoding,
code_base64: request.code_base64,
rules: rule_internals,
configuration_base64: request.configuration_base64,
options: request.options,
};
process_analysis_request(req_with_compiled, runtime)
} else {
let mut rule_internals = Vec::with_capacity(request.rules.len());
for server_rule in request.rules {
let rule_internal = RuleInternal::try_from(server_rule)?;
// The runtime's `v8::Script` cache is keyed only by name -- we need to manually clear it.
runtime.clear_rule_cache(&rule_internal.name);
rule_internals.push(rule_internal);
}
let req_with_internal: AnalysisRequest<RuleInternal> = AnalysisRequest {
filename: request.filename,
language: request.language,
file_encoding: request.file_encoding,
code_base64: request.code_base64,
rules: rule_internals,
configuration_base64: request.configuration_base64,
options: request.options,
};
process_analysis_request(req_with_internal, runtime)
}
}

#[cfg(test)]
mod tests {
use crate::datadog_static_analyzer_server::rule_cache::{cached_analysis_request, RuleCache};
use kernel::analysis::ddsa_lib;
use kernel::model::common::Language;
use kernel::model::rule::{compute_sha256, RuleCategory, RuleSeverity, RuleType};
use kernel::utils::encode_base64_string;
use server::model::analysis_request::{AnalysisRequest, ServerRule};

/// Tests that `cached_analysis_request` implements the (optional) cache properly.
/// When enabled, this requires:
/// * Updating the `RuleCache` cache
/// * Updating the thread-local JavaScript runtime's `v8::Script` cache
#[test]
fn test_cached_analysis_request() {
const RULE_NAME: &str = "ruleset/rule-name";
let v8 = ddsa_lib::test_utils::cfg_test_v8();

fn request_from(
rule_name: &str,
language: Language,
rule: (&str, &str),
) -> AnalysisRequest<ServerRule> {
let file_contents = "
def abc():
";
let code_base64 = encode_base64_string(rule.0.to_string());
let checksum = Some(compute_sha256(code_base64.as_bytes()));
let ts_query_b64 = encode_base64_string(rule.1.to_string());
let server_rule = ServerRule {
name: rule_name.to_string(),
short_description_base64: None,
description_base64: None,
category: Some(RuleCategory::BestPractices),
severity: Some(RuleSeverity::Warning),
language,
rule_type: RuleType::TreeSitterQuery,
entity_checked: None,
code_base64,
checksum,
pattern: None,
tree_sitter_query_base64: Some(ts_query_b64),
arguments: vec![],
};
AnalysisRequest {
filename: "file.py".to_string(),
language: Language::Python,
file_encoding: "utf-8".to_string(),
code_base64: encode_base64_string(file_contents.to_string()),
rules: vec![server_rule],
configuration_base64: None,
options: None,
}
}

let req_v1 = request_from(
RULE_NAME,
Language::Python,
(
// language=javascript
"\
function visit(captures) {
const func = captures.get('func');
addError(Violation.new('rule_v1', func));
}
",
"(function_definition) @func",
),
);
let req_v2 = request_from(
RULE_NAME,
Language::Python,
(
// language=javascript
"\
function visit(captures) {
const funcName = captures.get('name');
addError(Violation.new('rule_v2', funcName));
}
",
"(function_definition name: (identifier) @name)",
),
);

// Test invariants:
// Tests equality of rule name, which triggers a cache collision.
assert_eq!(req_v1.rules[0].name, req_v2.rules[0].name);
// Tests the clearing of `v8::Script` cache
assert_ne!(req_v1.rules[0].code_base64, req_v2.rules[0].code_base64);
// Tests the clearing of `RuleCache` cache`
assert_ne!(
req_v1.rules[0].tree_sitter_query_base64,
req_v2.rules[0].tree_sitter_query_base64
);

for test_case in [None, Some(&RuleCache::new())] {
let mut rt = v8.new_runtime();
let rule_responses =
cached_analysis_request(&mut rt, req_v1.clone(), test_case).unwrap();
assert_eq!(rule_responses[0].violations[0].0.message, "rule_v1");
if let Some(cache) = test_case {
assert_eq!(
cache.0.get(RULE_NAME).unwrap().rule.code_base64,
req_v1.rules[0].code_base64
);
}

let rule_responses =
cached_analysis_request(&mut rt, req_v2.clone(), test_case).unwrap();
assert_eq!(rule_responses[0].violations[0].0.message, "rule_v2");
if let Some(cache) = test_case {
assert_eq!(
cache.0.get(RULE_NAME).unwrap().rule.code_base64,
req_v2.rules[0].code_base64
);
}
}
}
}
Loading

0 comments on commit 3d1e3bc

Please sign in to comment.