Skip to content

Commit

Permalink
fix(cli): Ignore (do not abort on) non-utf8 files when walking recurs…
Browse files Browse the repository at this point in the history
…ively

Previously a hard error, but safest bet seems to
ignore. Ripgrep handles binary data with NUL byte
detection, which does not seem necessary here

Closes #166
  • Loading branch information
alexpovel committed Nov 5, 2024
1 parent fc43a92 commit b3b9d6e
Show file tree
Hide file tree
Showing 14 changed files with 182 additions and 4 deletions.
32 changes: 29 additions & 3 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use anyhow::{Context, Result};
use colored::Colorize;
use ignore::{WalkBuilder, WalkState};
use itertools::Itertools;
use log::{debug, error, info, trace, LevelFilter};
use log::{debug, error, info, trace, warn, LevelFilter};
use pathdiff::diff_paths;
#[cfg(feature = "german")]
use srgn::actions::German;
Expand Down Expand Up @@ -361,6 +361,8 @@ fn handle_actions_on_many_files_sorted(

n_files_processed += match res {
Ok(()) => 1,

// Soft errors with reasonable handling available:
Err(PathProcessingError::NotAFile | PathProcessingError::InvalidFile) => 0,
Err(PathProcessingError::ApplicationError(ApplicationError::SomeInScope))
if global_options.fail_any =>
Expand All @@ -379,13 +381,24 @@ fn handle_actions_on_many_files_sorted(
trace!("Detected broken pipe, stopping search.");
break;
}
Err(PathProcessingError::IoError(e, _))
// `InvalidData` does NOT equal "invalid utf-8", but that's how
// it's _effectively_ used in the "read to string" type of
// functions we use throughout.
// https://github.com/rust-lang/rust/blob/096277e989d6de11c3077472fc05778e261e7b8e/library/std/src/io/error.rs#L78-L79
if e.kind() == io::ErrorKind::InvalidData =>
{
warn!("File contains unreadable data (binary? invalid utf-8?), skipped: {}", path.display());
0
}

// Hard errors we should do something about:
Err(
e @ (PathProcessingError::ApplicationError(ApplicationError::ActionError(
..,
))
| PathProcessingError::IoError(..)),
) => {
// Hard errors we should do something about.
if search_mode {
error!("Error walking at {}: {}", path.display(), e);
0
Expand Down Expand Up @@ -478,6 +491,8 @@ fn handle_actions_on_many_files_threaded(
*n_files_processed.lock().unwrap() += 1;
WalkState::Continue
}

// Soft errors with reasonable handling available:
Err(PathProcessingError::NotAFile | PathProcessingError::InvalidFile) => {
WalkState::Continue
}
Expand All @@ -499,11 +514,22 @@ fn handle_actions_on_many_files_threaded(
trace!("Detected broken pipe, stopping search.");
WalkState::Quit
}
Err(PathProcessingError::IoError(e, _))
// `InvalidData` does NOT equal "invalid utf-8", but that's
// how it's _effectively_ used in the "read to string" type
// of functions we use throughout.
// https://github.com/rust-lang/rust/blob/096277e989d6de11c3077472fc05778e261e7b8e/library/std/src/io/error.rs#L78-L79
if e.kind() == io::ErrorKind::InvalidData =>
{
warn!("File contains unreadable data (binary? invalid utf-8?), skipped: {}", path.display());
WalkState::Continue
}

// Hard errors we should do something about:
Err(
e @ (PathProcessingError::ApplicationError(..)
| PathProcessingError::IoError(..)),
) => {
// Hard errors we should do something about.
error!("Error walking at {} due to: {}", path.display(), e);

if search_mode {
Expand Down
33 changes: 32 additions & 1 deletion tests/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -396,6 +396,32 @@ Heizoelrueckstossabdaempfung.
// breaking snapshot testing.
true,
)]
#[case::binary_data_sorted(
"binary-data-sorted",
"tests/files/binary-data/in",
&[
"--sorted",
"--glob",
"**/*",
"0a1a09c8-2995-4ac5-9d60-01a0f02920e8",
"gone"
],
false,
)]
#[case::binary_data_unsorted(
"binary-data-sorted",
"tests/files/binary-data/in",
&[
"--glob",
"**/*",
"0a1a09c8-2995-4ac5-9d60-01a0f02920e8",
"gone"
],
// NOT `--sorted`, so not deterministic; use to test that directories are
// equivalent even if running parallel, unsorted. Output will be random,
// breaking snapshot testing.
true,
)]
fn test_cli_files(
#[case] mut snapshot_name: String,
#[case] input: PathBuf,
Expand Down Expand Up @@ -439,7 +465,12 @@ Heizoelrueckstossabdaempfung.
// Assert

// Thing itself works
assert!(output.status.success(), "Binary execution itself failed");
output.status.success().then_some(()).ok_or_else(|| {
anyhow::anyhow!(
"Binary execution itself failed: {}",
String::from_utf8_lossy(&output.stderr).escape_debug()
)
})?;

// Do not drop on panic, to keep tmpdir in place for manual inspection. Can then
// diff directories.
Expand Down
3 changes: 3 additions & 0 deletions tests/files/binary-data/in/invalid-utf8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# DO NOT save this file in an editor: it will overwrite the invalid file marker with a
# literal �, which is valid utf-8
invalid utf8 �
3 changes: 3 additions & 0 deletions tests/files/binary-data/in/subdir/invalid-utf8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# DO NOT save this file in an editor: it will overwrite the invalid file marker with a
# literal �, which is valid utf-8
invalid utf8 �
2 changes: 2 additions & 0 deletions tests/files/binary-data/in/subdir/valid-utf8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
valid utf8
unique string for precise searching: 0a1a09c8-2995-4ac5-9d60-01a0f02920e8
3 changes: 3 additions & 0 deletions tests/files/binary-data/out/invalid-utf8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# DO NOT save this file in an editor: it will overwrite the invalid file marker with a
# literal �, which is valid utf-8
invalid utf8 �
3 changes: 3 additions & 0 deletions tests/files/binary-data/out/subdir/invalid-utf8
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# DO NOT save this file in an editor: it will overwrite the invalid file marker with a
# literal �, which is valid utf-8
invalid utf8 �
2 changes: 2 additions & 0 deletions tests/files/binary-data/out/subdir/valid-utf8
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
valid utf8
unique string for precise searching: gone
19 changes: 19 additions & 0 deletions tests/snapshots/cli__tests__binary-data-sorted-dry-run-linux.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
source: tests/cli.rs
expression: "CommandSnap\n{\n args, stdin: None, stdout:\n stdout.split_inclusive('\\n').map(ToOwned::to_owned).collect_vec(),\n exit_code,\n}"
info:
stderr: []
---
args:
- "--sorted"
- "--glob"
- "**/*"
- 0a1a09c8-2995-4ac5-9d60-01a0f02920e8
- gone
stdin: ~
stdout:
- "subdir/valid-utf8\n"
- "2:unique string for precise searching: 0a1a09c8-2995-4ac5-9d60-01a0f02920e8\n"
- "2:unique string for precise searching: gone\n"
- "\n"
exit_code: 0
19 changes: 19 additions & 0 deletions tests/snapshots/cli__tests__binary-data-sorted-dry-run-macos.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
source: tests/cli.rs
expression: "CommandSnap\n{\n args, stdin: None, stdout:\n stdout.split_inclusive('\\n').map(ToOwned::to_owned).collect_vec(),\n exit_code,\n}"
info:
stderr: []
---
args:
- "--sorted"
- "--glob"
- "**/*"
- 0a1a09c8-2995-4ac5-9d60-01a0f02920e8
- gone
stdin: ~
stdout:
- "subdir/valid-utf8\n"
- "2:unique string for precise searching: 0a1a09c8-2995-4ac5-9d60-01a0f02920e8\n"
- "2:unique string for precise searching: gone\n"
- "\n"
exit_code: 0
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
---
source: tests/cli.rs
expression: "CommandSnap\n{\n args, stdin: None, stdout:\n stdout.split_inclusive('\\n').map(ToOwned::to_owned).collect_vec(),\n exit_code,\n}"
info:
stderr: []
---
args:
- "--sorted"
- "--glob"
- "**/*"
- 0a1a09c8-2995-4ac5-9d60-01a0f02920e8
- gone
stdin: ~
stdout:
- "subdir\\valid-utf8\n"
- "2:unique string for precise searching: 0a1a09c8-2995-4ac5-9d60-01a0f02920e8\n"
- "2:unique string for precise searching: gone\n"
- "\n"
exit_code: 0
16 changes: 16 additions & 0 deletions tests/snapshots/cli__tests__binary-data-sorted-linux.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
source: tests/cli.rs
expression: "CommandSnap\n{\n args, stdin: None, stdout:\n stdout.split_inclusive('\\n').map(ToOwned::to_owned).collect_vec(),\n exit_code,\n}"
info:
stderr: []
---
args:
- "--sorted"
- "--glob"
- "**/*"
- 0a1a09c8-2995-4ac5-9d60-01a0f02920e8
- gone
stdin: ~
stdout:
- "subdir/valid-utf8\n"
exit_code: 0
16 changes: 16 additions & 0 deletions tests/snapshots/cli__tests__binary-data-sorted-macos.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
source: tests/cli.rs
expression: "CommandSnap\n{\n args, stdin: None, stdout:\n stdout.split_inclusive('\\n').map(ToOwned::to_owned).collect_vec(),\n exit_code,\n}"
info:
stderr: []
---
args:
- "--sorted"
- "--glob"
- "**/*"
- 0a1a09c8-2995-4ac5-9d60-01a0f02920e8
- gone
stdin: ~
stdout:
- "subdir/valid-utf8\n"
exit_code: 0
16 changes: 16 additions & 0 deletions tests/snapshots/cli__tests__binary-data-sorted-windows.snap
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
source: tests/cli.rs
expression: "CommandSnap\n{\n args, stdin: None, stdout:\n stdout.split_inclusive('\\n').map(ToOwned::to_owned).collect_vec(),\n exit_code,\n}"
info:
stderr: []
---
args:
- "--sorted"
- "--glob"
- "**/*"
- 0a1a09c8-2995-4ac5-9d60-01a0f02920e8
- gone
stdin: ~
stdout:
- "subdir\\valid-utf8\n"
exit_code: 0

0 comments on commit b3b9d6e

Please sign in to comment.