Skip to content

Commit

Permalink
Exclude the datastore directory from scanning
Browse files Browse the repository at this point in the history
Fixes #32.
  • Loading branch information
bradlarsen committed Feb 22, 2023
1 parent 8146fdb commit 25088e8
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 20 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- When scanning, Git repositories are now opened twice: once at input enumeration time, and once at scanning time.
This drastically reduces the amount of memory required to scan a large number of Git repositories.

### Fixes
- When scanning, the datastore is now explicitly excluded from filesystem enumeration.
This ensures that files used internally for Nosey Parker's operation are not inadvertently scanned ([#32](https://github.com/praetorian-inc/noseyparker/issues/32)).


## [v0.11.0](https://github.com/praetorian-inc/noseyparker/releases/v0.11.0) (2022-12-30)

Expand Down
52 changes: 39 additions & 13 deletions src/bin/noseyparker/cmd_scan.rs
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
use anyhow::{Context, Result, bail};
use anyhow::{bail, Context, Result};
use indicatif::{HumanBytes, HumanCount, HumanDuration};
use rayon::prelude::*;
use std::str::FromStr;
use std::sync::mpsc;
use std::sync::Mutex;
use std::time::Instant;
use tracing::{debug, debug_span, info, error, warn};
use tracing::{debug, debug_span, error, info, warn};

use crate::args;

use noseyparker::blob::Blob;
use noseyparker::blob_id_set::BlobIdSet;
use noseyparker::datastore::Datastore;
use noseyparker::defaults::DEFAULT_IGNORE_RULES;
use noseyparker::github;
use noseyparker::git_binary::Git;
use noseyparker::git_url::GitUrl;
use noseyparker::github;
use noseyparker::input_enumerator::{open_git_repo, FileResult, FilesystemEnumerator};
use noseyparker::location;
use noseyparker::match_type::Match;
Expand Down Expand Up @@ -51,7 +51,6 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
// Open datastore
// ---------------------------------------------------------------------------------------------
let mut datastore = Datastore::create_or_open(&args.datastore)?;
let tmpdir = datastore.tmpdir();

// ---------------------------------------------------------------------------------------------
// Load rules
Expand Down Expand Up @@ -102,7 +101,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
let mut input_roots = args.input_args.path_inputs.clone();

if !repo_urls.is_empty() {
let clones_dir = tmpdir.join("clones");
let clones_dir = datastore.clones_dir();
let git = Git::new();

// FIXME: put a progress meter around this; disable Git's built-in progress
Expand All @@ -117,7 +116,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
warn!("Skipping scan of {repo_url}");
});
progress.inc(1);
continue
continue;
}
Ok(output_dir) => output_dir,
};
Expand All @@ -131,11 +130,21 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
input_roots.push(output_dir);
progress.inc(1);
continue;
},
}
Err(e) => {
progress.suspend(|| warn!("Failed to update clone of {repo_url} at {}: {e}", output_dir.display()));
if let Err(e) = std::fs::remove_dir_all(&output_dir) {
progress.suspend(|| error!("Failed to remove clone directory at {}: {e}", output_dir.display()));
progress.suspend(|| {
warn!(
"Failed to update clone of {repo_url} at {}: {e}",
output_dir.display()
)
});
if let Err(e) = std::fs::remove_dir_all(&output_dir) {
progress.suspend(|| {
error!(
"Failed to remove clone directory at {}: {e}",
output_dir.display()
)
});
}
}
}
Expand Down Expand Up @@ -175,10 +184,11 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
// Load default ignore file. Note that we have to write it to a file first,
// because the API for the `ignore` crate doesn't expose something that takes a
// string.
let ignore_path = tmpdir.join("default_ignore_rules.conf");
let ignore_path = datastore.scratch_dir().join("default_ignore_rules.conf");
std::fs::write(&ignore_path, DEFAULT_IGNORE_RULES).with_context(|| {
format!("Failed to write default ignore rules to {}", ignore_path.display())
})?;

ie.add_ignore(&ignore_path).with_context(|| {
format!("Failed to load ignore rules from {}", ignore_path.display())
})?;
Expand All @@ -191,6 +201,23 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
})?;
}

// Make sure the datastore itself is not scanned.
//
// Both the datastore root and each candidate entry are canonicalized before being
// compared, so that differently spelled paths to the same location still compare equal.
let datastore_path = std::fs::canonicalize(datastore.root_dir()).with_context(|| {
    format!(
        "Failed to canonicalize datastore path {}",
        datastore.root_dir().display()
    )
})?;
ie.filter_entry(move |entry| match std::fs::canonicalize(entry.path()) {
    // Keep every entry except the datastore root itself.
    // NOTE(review): per the `ignore` crate docs, rejecting a directory entry should also
    // prevent descending into it, which is what excludes the datastore's contents — confirm.
    Ok(path) => path != datastore_path,
    // An entry whose path cannot be resolved cannot be shown to be the datastore,
    // so keep it rather than silently dropping input.
    Err(e) => {
        warn!("Failed to canonicalize path {}: {}", entry.path().display(), e);
        true
    }
});

Ok(ie)
}()
.context("Failed to initialize filesystem enumerator")?;
Expand Down Expand Up @@ -277,6 +304,7 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
datastore
.analyze()
.expect("should be able to analyze the database");
// FIXME: `num_added` is not computed correctly
(datastore, num_matches, num_added as u64)
})
.expect("should be able to start datastore writer thread")
Expand Down Expand Up @@ -461,8 +489,6 @@ pub fn run(global_args: &args::GlobalArgs, args: &args::ScanArgs) -> Result<()>
Ok(())
}



/// Get a path for a local clone of the given git URL underneath `root`.
fn clone_destination(root: &std::path::Path, repo: &GitUrl) -> Result<std::path::PathBuf> {
Ok(root.join(repo.to_path_buf()))
Expand Down
26 changes: 20 additions & 6 deletions src/datastore.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,20 @@ impl Datastore {
ds.migrate()
.with_context(|| format!("Failed to migrate database at {}", db_path.display()))?;

let tmpdir = ds.tmpdir();
std::fs::create_dir_all(&tmpdir).with_context(|| {
let scratch_dir = ds.scratch_dir();
std::fs::create_dir_all(&scratch_dir).with_context(|| {
format!(
"Failed to create temporary directory {} for datastore at {}",
tmpdir.display(),
"Failed to create scratch directory {} for datastore at {}",
scratch_dir.display(),
ds.root_dir().display()
)
})?;

let clones_dir = ds.clones_dir();
std::fs::create_dir_all(&clones_dir).with_context(|| {
format!(
"Failed to create clones directory {} for datastore at {}",
clones_dir.display(),
ds.root_dir().display()
)
})?;
Expand All @@ -73,11 +82,16 @@ impl Datastore {
Self::open(root_dir)
}

/// Get the path to this datastore's temporary directory.
pub fn tmpdir(&self) -> PathBuf {
/// Get the path to this datastore's scratch directory.
///
/// The returned path is the `scratch` entry directly beneath the datastore root.
/// This method only computes the path; it does not create or inspect anything on disk.
pub fn scratch_dir(&self) -> PathBuf {
self.root_dir.join("scratch")
}

/// Get the path to this datastore's clones directory.
///
/// The returned path is the `clones` entry directly beneath the datastore root.
/// This method only computes the path; it does not create or inspect anything on disk.
pub fn clones_dir(&self) -> PathBuf {
self.root_dir.join("clones")
}

fn new_connection(path: &Path) -> Result<Connection> {
let conn = Connection::open(path)?;

Expand Down
10 changes: 9 additions & 1 deletion src/input_enumerator.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
use anyhow::{Context, Result};
use ignore::{WalkBuilder, WalkState};
use ignore::{DirEntry, WalkBuilder, WalkState};
use std::path::{Path, PathBuf};
use std::sync::Mutex;
use tracing::{debug, error, warn};
Expand Down Expand Up @@ -224,6 +224,14 @@ impl FilesystemEnumerator {
self
}

/// Install a predicate that decides which directory entries are enumerated.
///
/// The predicate is forwarded unchanged to `filter_entry` on the underlying walk builder.
/// NOTE(review): presumably entries for which the predicate returns `false` are skipped,
/// and a rejected directory is not descended into — confirm against the `ignore` crate's
/// `WalkBuilder::filter_entry` documentation.
///
/// The predicate must be `Send + Sync + 'static`, as required by the bounds below.
///
/// Returns `&mut Self` so that configuration calls can be chained.
pub fn filter_entry<P>(&mut self, filter: P) -> &mut Self
where
P: Fn(&DirEntry) -> bool + Send + Sync + 'static
{
self.walk_builder.filter_entry(filter);
self
}

pub fn run(&self, progress: &Progress) -> Result<FilesystemEnumeratorResult> {
let files = Mutex::new(Vec::new());
let git_repos = Mutex::new(Vec::new());
Expand Down

0 comments on commit 25088e8

Please sign in to comment.