Skip to content

Commit

Permalink
Add support for operating on byte strings
Browse files Browse the repository at this point in the history
This adds a `bytes` submodule for operating on byte strings that might
contain invalid UTF-8. Where possible I have switched the functions that
operate on `str` to use the `bytes` functions internally to avoid
duplicating code and eliminate the potential for differing behavior
between the two functions.

It includes trivial tests confirming that the `bytes` versions of the
functions actually work on invalid UTF-8.

Fixes #12
  • Loading branch information
danielparks committed Sep 4, 2023
1 parent aa2d6e3 commit 879d212
Show file tree
Hide file tree
Showing 4 changed files with 284 additions and 21 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# Next release

* Adds `bytes` module to support operating directly on byte strings.

# 1.1.0

* Adds the `std` feature (enabled by default)
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,9 @@ You only get the default settings of shlex.split, which mimic the POSIX shell:
This implementation also deviates from the Python version in not treating \r
specially, which I believe is more compliant.

The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate
over the bytes directly as a micro-optimization.
This crate can be used on either normal Rust strings, or on byte strings with
the `bytes` module. The algorithms used are oblivious to UTF-8 high bytes, so
internally they all work on bytes directly as a micro-optimization.

Disabling the `std` feature (which is enabled by default) will allow the crate
to work in `no_std` environments, where the `alloc` crate, and a global
Expand Down
267 changes: 267 additions & 0 deletions src/bytes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,267 @@
// Copyright 2015 Nicholas Allegra (comex).
// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or
// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
// copied, modified, or distributed except according to those terms.

//! [`Shlex`] and friends for byte strings.
//!
//! This may be more convenient if you are working with byte slices (`[u8]`)
//! or types that are wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
//!
//! ```rust
//! #[cfg(unix)] {
//! use shlex::bytes::quote;
//! use std::ffi::OsStr;
//! use std::os::unix::ffi::OsStrExt;
//!
//! // `\x80` is invalid in UTF-8.
//! let os_str = OsStr::from_bytes(b"a\x80b c");
//! assert_eq!(quote(os_str.as_bytes()), &b"\"a\x80b c\""[..]);
//! }
//! ```
//!
//! (On Windows, `OsStr` uses 16 bit wide characters so this will not work.)
extern crate alloc;
use alloc::vec::Vec;
use alloc::borrow::Cow;
#[cfg(test)]
use alloc::vec;
#[cfg(test)]
use alloc::borrow::ToOwned;

/// Lazily tokenizes a byte string into words, applying the quoting and escaping rules of the
/// POSIX shell.
pub struct Shlex<'a> {
    in_iter: core::slice::Iter<'a, u8>,
    /// One more than the number of newline bytes consumed so far.
    pub line_no: usize,
    /// Becomes `true` when the input stops in the middle of a quotation or immediately after an
    /// unescaped backslash. `Iterator` has no channel for reporting errors, so in that situation
    /// the unfinished token is discarded and iteration ends; inspect this flag once you have
    /// finished iterating.
    pub had_error: bool,
}

impl<'a> Shlex<'a> {
    /// Creates a tokenizer positioned at the start of `in_bytes`, on line 1 and with no error
    /// recorded.
    pub fn new(in_bytes: &'a [u8]) -> Self {
        Shlex { in_iter: in_bytes.iter(), line_no: 1, had_error: false }
    }

    /// Assembles one word whose first byte, `ch`, has already been consumed.
    ///
    /// Returns `None` and sets `had_error` if the input ends inside a quotation or right after a
    /// backslash.
    fn parse_word(&mut self, mut ch: u8) -> Option<Vec<u8>> {
        let mut word = Vec::new();
        loop {
            let outcome = match ch {
                b'"' => self.parse_double(&mut word),
                b'\'' => self.parse_single(&mut word),
                b'\\' => match self.next_char() {
                    // \<newline> => nothing (line continuation)
                    Some(b'\n') => Ok(()),
                    // \x => x
                    Some(escaped) => {
                        word.push(escaped);
                        Ok(())
                    }
                    // A backslash with nothing after it is an error.
                    None => Err(()),
                },
                // Unquoted whitespace terminates the word.
                b' ' | b'\t' | b'\n' => break,
                plain => {
                    word.push(plain);
                    Ok(())
                }
            };
            if outcome.is_err() {
                self.had_error = true;
                return None;
            }
            match self.next_char() {
                Some(following) => ch = following,
                None => break,
            }
        }
        Some(word)
    }

    /// Consumes the remainder of a double-quoted section (the opening quote has already been
    /// read), appending its decoded contents to `result`.
    ///
    /// Returns `Err(())` if the input ends before the closing quote.
    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        loop {
            match self.next_char().ok_or(())? {
                b'"' => return Ok(()),
                b'\\' => match self.next_char().ok_or(())? {
                    // \<newline> => nothing
                    b'\n' => {}
                    escaped => {
                        // \$ => $ (likewise for `, " and \); any other \x => \x, backslash kept
                        if !matches!(escaped, b'$' | b'`' | b'"' | b'\\') {
                            result.push(b'\\');
                        }
                        result.push(escaped);
                    }
                },
                plain => result.push(plain),
            }
        }
    }

    /// Consumes the remainder of a single-quoted section (the opening quote has already been
    /// read), appending every byte up to the closing quote to `result` verbatim.
    ///
    /// Returns `Err(())` if the input ends before the closing quote.
    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        while let Some(byte) = self.next_char() {
            if byte == b'\'' {
                return Ok(());
            }
            result.push(byte);
        }
        Err(())
    }

    /// Reads the next input byte, bumping `line_no` whenever that byte is a newline.
    fn next_char(&mut self) -> Option<u8> {
        let byte = *self.in_iter.next()?;
        if byte == b'\n' {
            self.line_no += 1;
        }
        Some(byte)
    }
}

impl<'a> Iterator for Shlex<'a> {
    type Item = Vec<u8>;

    /// Skips leading whitespace and `#` comments, then parses and returns the next word.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.next_char()? {
                // skip initial whitespace
                b' ' | b'\t' | b'\n' => {}
                // A `#` at the start of a word begins a comment running to the end of the line.
                b'#' => {
                    while let Some(skipped) = self.next_char() {
                        if skipped == b'\n' {
                            break;
                        }
                    }
                }
                first => return self.parse_word(first),
            }
        }
    }
}

/// Splits an entire byte string into words in a single call.
///
/// Returns `None` if the tokenizer flagged the input as erroneous (see `Shlex::had_error`);
/// otherwise returns every word in order.
pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> {
    let mut lexer = Shlex::new(in_bytes);
    let mut words = Vec::new();
    // Drain through a mutable borrow so the error flag can still be inspected afterwards.
    words.extend(&mut lexer);
    if lexer.had_error {
        return None;
    }
    Some(words)
}

/// Given a single word, return a byte string suitable to encode it as a shell argument.
///
/// * An empty input is returned as two double quotes (`""`).
/// * An input containing none of the bytes the shell treats specially is returned borrowed and
///   unchanged.
/// * Anything else is wrapped in double quotes, with a backslash inserted before each `$`, `` ` ``,
///   `"` and `\`. All other bytes are copied through unchanged.
///
/// The braces `{` and `}` count as special, because several shells perform brace expansion on
/// unquoted words such as `{a,b}`.
///
/// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
/// ever inserts valid ASCII characters before or after existing ASCII characters (or
/// returns two double quotes if the input was an empty string). It will never modify a
/// multibyte UTF-8 character.
pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
    if in_bytes.is_empty() {
        return b"\"\""[..].into();
    }

    let needs_quoting = in_bytes.iter().any(|&c| {
        matches!(
            c,
            b'|' | b'&' | b';' | b'<' | b'>' | b'(' | b')' | b'$' | b'`' | b'\\' | b'"' | b'\''
                | b' ' | b'\t' | b'\r' | b'\n' | b'*' | b'?' | b'[' | b'#' | b'~' | b'=' | b'%'
                | b'{' | b'}'
        )
    });
    if !needs_quoting {
        return in_bytes.into();
    }

    // Two surrounding quotes are always added; backslash escapes may grow the buffer further.
    let mut out: Vec<u8> = Vec::with_capacity(in_bytes.len() + 2);
    out.push(b'"');
    for &c in in_bytes {
        // Only these four bytes keep a special meaning inside double quotes.
        if matches!(c, b'$' | b'`' | b'"' | b'\\') {
            out.push(b'\\');
        }
        out.push(c);
    }
    out.push(b'"');
    out.into()
}

/// Convenience function that consumes an iterable of words and turns it into a single byte string,
/// quoting words when necessary. Consecutive words will be separated by a single space.
pub fn join<'a, I: core::iter::IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> {
words.into_iter()
.map(quote)
.collect::<Vec<_>>()
.join(&b' ')
}

// A single byte that cannot be decoded as UTF-8 on its own; shared by the tests below.
#[cfg(test)]
const INVALID_UTF8: &[u8] = &[0xa1];

#[test]
fn test_invalid_utf8() {
    // Sanity check on the fixture itself: decoding it as UTF-8 must fail.
    let decoded = core::str::from_utf8(INVALID_UTF8);
    assert!(decoded.is_err());
}

// Table of `(input, expected)` pairs driving `test_split`. `Some(words)` lists the words the
// input must split into; `None` marks inputs that `split` must reject as erroneous.
#[cfg(test)]
static SPLIT_TEST_ITEMS: &'static [(&'static [u8], Option<&'static [&'static [u8]]>)] = &[
// Plain words, whitespace separation and quoting inside a word.
(b"foo$baz", Some(&[b"foo$baz"])),
(b"foo baz", Some(&[b"foo", b"baz"])),
(b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
(b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
(b" foo \nbar", Some(&[b"foo", b"bar"])),
// Backslash-newline is dropped, both unquoted and inside double quotes.
(b"foo\\\nbar", Some(&[b"foobar"])),
(b"\"foo\\\nbar\"", Some(&[b"foobar"])),
// Backslashes are literal inside single quotes.
(b"'baz\\$b'", Some(&[b"baz\\$b"])),
// Unterminated quotations and trailing backslashes are errors.
(b"'baz\\\''", None),
(b"\\", None),
(b"\"\\", None),
(b"'\\", None),
(b"\"", None),
(b"'", None),
// `#` starts a comment only at the beginning of a word.
(b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
(b"foo #bar", Some(&[b"foo"])),
(b"foo#bar", Some(&[b"foo#bar"])),
(b"foo\"#bar", None),
(b"'\\n'", Some(&[b"\\n"])),
(b"'\\\\n'", Some(&[b"\\\\n"])),
// Bytes that are not valid UTF-8 pass through untouched.
(INVALID_UTF8, Some(&[INVALID_UTF8])),
];

#[test]
fn test_split() {
    // Each table entry pairs an input with the words it must yield, or `None` on error.
    SPLIT_TEST_ITEMS.iter().for_each(|&(input, expected)| {
        let wanted: Option<Vec<Vec<u8>>> =
            expected.map(|words| words.iter().map(|&word| word.to_owned()).collect());
        assert_eq!(split(input), wanted);
    });
}

#[test]
fn test_lineno() {
    // Two newlines precede "bar", so the counter must read 3 once that word is produced.
    let mut lexer = Shlex::new(b"\nfoo\nbar");
    loop {
        let word = match lexer.next() {
            Some(word) => word,
            None => break,
        };
        if word == b"bar" {
            assert_eq!(lexer.line_no, 3);
        }
    }
}

#[test]
fn test_quote() {
    // (input, expected quoted form)
    let cases: [(&[u8], &[u8]); 5] = [
        (b"foobar", b"foobar"),
        (b"foo bar", b"\"foo bar\""),
        (b"\"", b"\"\\\"\""),
        (b"", b"\"\""),
        (INVALID_UTF8, INVALID_UTF8),
    ];
    for &(input, expected) in cases.iter() {
        assert_eq!(quote(input), expected);
    }
}

#[test]
fn test_join() {
    // (words to join, expected joined byte string)
    let cases: [(Vec<&[u8]>, &[u8]); 5] = [
        (vec![], &b""[..]),
        (vec![&b""[..]], &b"\"\""[..]),
        (vec![&b"a"[..], &b"b"[..]], &b"a b"[..]),
        (vec![&b"foo bar"[..], &b"baz"[..]], &b"\"foo bar\" baz"[..]),
        (vec![INVALID_UTF8], INVALID_UTF8),
    ];
    for (words, expected) in cases.iter() {
        assert_eq!(join(words.clone()), *expected);
    }
}
29 changes: 10 additions & 19 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ use alloc::vec;
#[cfg(test)]
use alloc::borrow::ToOwned;

pub mod bytes;

/// An iterator that takes an input string and splits it into the words using the same syntax as
/// the POSIX shell.
pub struct Shlex<'a> {
Expand Down Expand Up @@ -159,26 +161,15 @@ pub fn split(in_str: &str) -> Option<Vec<String>> {

/// Given a single word, return a string suitable to encode it as a shell argument.
pub fn quote(in_str: &str) -> Cow<str> {
if in_str.len() == 0 {
"\"\"".into()
} else if in_str.bytes().any(|c| match c as char {
'|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' |
'\r' | '\n' | '*' | '?' | '[' | '#' | '~' | '=' | '%' => true,
_ => false
}) {
let mut out: Vec<u8> = Vec::new();
out.push('"' as u8);
for c in in_str.bytes() {
match c as char {
'$' | '`' | '"' | '\\' => out.push('\\' as u8),
_ => ()
}
out.push(c);
match bytes::quote(in_str.as_bytes()) {
Cow::Borrowed(out) => {
// Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
unsafe { core::str::from_utf8_unchecked(out) }.into()
}
Cow::Owned(out) => {
// Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
unsafe { String::from_utf8_unchecked(out) }.into()
}
out.push('"' as u8);
unsafe { String::from_utf8_unchecked(out) }.into()
} else {
in_str.into()
}
}

Expand Down

0 comments on commit 879d212

Please sign in to comment.