-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add support for operating on byte strings
This adds a `bytes` submodule for operating on byte strings that might contain invalid UTF-8. Where possible I have switched the functions that operate on `str` to use the `bytes` functions internally to avoid duplicating code and eliminate the potential for differing behavior between the two functions. It includes trivial tests that confirm that the `bytes` version of functions actually work on invalid UTF-8. Fixes #12
- Loading branch information
1 parent
aa2d6e3
commit 879d212
Showing
4 changed files
with
284 additions
and
21 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,267 @@ | ||
// Copyright 2015 Nicholas Allegra (comex). | ||
// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or | ||
// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be | ||
// copied, modified, or distributed except according to those terms. | ||
|
||
//! [`Shlex`] and friends for byte strings. | ||
//! | ||
//! This may be more convenient if you are working with byte slices (`[u8]`) | ||
//! or types that are wrappers around bytes, such as [`OsStr`](std::ffi::OsStr): | ||
//! | ||
//! ```rust | ||
//! #[cfg(unix)] { | ||
//! use shlex::bytes::quote; | ||
//! use std::ffi::OsStr; | ||
//! use std::os::unix::ffi::OsStrExt; | ||
//! | ||
//! // `\x80` is invalid in UTF-8. | ||
//! let os_str = OsStr::from_bytes(b"a\x80b c"); | ||
//! assert_eq!(quote(os_str.as_bytes()), &b"\"a\x80b c\""[..]); | ||
//! } | ||
//! ``` | ||
//! | ||
//! (On Windows, `OsStr` uses 16-bit wide characters, so this will not work.)
extern crate alloc; | ||
use alloc::vec::Vec; | ||
use alloc::borrow::Cow; | ||
#[cfg(test)] | ||
use alloc::vec; | ||
#[cfg(test)] | ||
use alloc::borrow::ToOwned; | ||
|
||
/// An iterator that takes an input byte string and splits it into the words using the same syntax as
/// the POSIX shell.
///
/// Each item is one word with quotes and escapes already resolved. The input does not have to be
/// valid UTF-8; bytes outside ASCII are copied through unchanged.
pub struct Shlex<'a> {
    in_iter: core::slice::Iter<'a, u8>,
    /// The number of newlines read so far, plus one.
    pub line_no: usize,
    /// An input string is erroneous if it ends while inside a quotation or right after an
    /// unescaped backslash. Since Iterator does not have a mechanism to return an error, if that
    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
    /// true; best to check it after you're done iterating.
    pub had_error: bool,
}

impl<'a> Shlex<'a> {
    /// Creates a lexer over `in_bytes`, starting at line 1 with no error recorded.
    pub fn new(in_bytes: &'a [u8]) -> Self {
        Shlex {
            in_iter: in_bytes.iter(),
            line_no: 1,
            had_error: false,
        }
    }

    /// Parses one word whose first byte, `ch`, has already been consumed.
    ///
    /// Returns `None` and sets `had_error` if the input ends inside a quotation or right after a
    /// backslash.
    fn parse_word(&mut self, ch: u8) -> Option<Vec<u8>> {
        match self.try_parse_word(ch) {
            Ok(word) => Some(word),
            Err(()) => {
                self.had_error = true;
                None
            }
        }
    }

    /// Does the actual work of `parse_word`, reporting premature end of input as `Err(())` so
    /// that the error flag is set in exactly one place.
    fn try_parse_word(&mut self, mut ch: u8) -> Result<Vec<u8>, ()> {
        let mut result: Vec<u8> = Vec::new();
        loop {
            match ch {
                b'"' => self.parse_double(&mut result)?,
                b'\'' => self.parse_single(&mut result)?,
                b'\\' => match self.next_char() {
                    // \<newline> => nothing (line continuation)
                    Some(b'\n') => {}
                    // \x => x
                    Some(escaped) => result.push(escaped),
                    None => return Err(()),
                },
                // Unquoted whitespace terminates the word.
                b' ' | b'\t' | b'\n' => break,
                _ => result.push(ch),
            }
            match self.next_char() {
                Some(next) => ch = next,
                None => break,
            }
        }
        Ok(result)
    }

    /// Consumes the body of a double-quoted string (the opening quote has already been read),
    /// appending its contents to `result`. Returns `Err(())` if the input ends before the
    /// closing quote.
    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        while let Some(ch) = self.next_char() {
            match ch {
                b'\\' => {
                    let escaped = self.next_char().ok_or(())?;
                    match escaped {
                        // \$ => $
                        b'$' | b'`' | b'"' | b'\\' => result.push(escaped),
                        // \<newline> => nothing
                        b'\n' => {}
                        // \x => \x
                        _ => {
                            result.push(b'\\');
                            result.push(escaped);
                        }
                    }
                }
                b'"' => return Ok(()),
                _ => result.push(ch),
            }
        }
        Err(())
    }

    /// Consumes the body of a single-quoted string (the opening quote has already been read),
    /// appending every byte up to the closing quote to `result` verbatim. Returns `Err(())` if
    /// the input ends before the closing quote.
    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
        while let Some(ch) = self.next_char() {
            if ch == b'\'' {
                return Ok(());
            }
            result.push(ch);
        }
        Err(())
    }

    /// Reads the next input byte, bumping `line_no` whenever that byte is a newline.
    fn next_char(&mut self) -> Option<u8> {
        let res = self.in_iter.next().copied();
        if res == Some(b'\n') {
            self.line_no += 1;
        }
        res
    }
}

impl<'a> Iterator for Shlex<'a> {
    type Item = Vec<u8>;

    /// Skips leading whitespace and `#` comments, then parses and returns the next word.
    fn next(&mut self) -> Option<Self::Item> {
        let mut ch = self.next_char()?;
        // skip initial whitespace
        loop {
            match ch {
                b' ' | b'\t' | b'\n' => {}
                b'#' => {
                    // A comment runs up to and including the next newline.
                    while let Some(skipped) = self.next_char() {
                        if skipped == b'\n' {
                            break;
                        }
                    }
                }
                _ => break,
            }
            ch = self.next_char()?;
        }
        self.parse_word(ch)
    }
}
|
||
/// Convenience function that consumes the whole byte string at once. Returns None if the input was | ||
/// erroneous. | ||
pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> { | ||
let mut shl = Shlex::new(in_bytes); | ||
let res = shl.by_ref().collect(); | ||
if shl.had_error { None } else { Some(res) } | ||
} | ||
|
||
/// Given a single word, return a byte string suitable to encode it as a shell argument.
///
/// * An empty input becomes two double quotes (`""`).
/// * An input containing whitespace or a shell-special byte (including the brace-expansion
///   characters `{` and `}`) is wrapped in double quotes, with `$`, `` ` ``, `"` and `\`
///   backslash-escaped.
/// * Anything else is returned borrowed and unchanged.
///
/// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
/// ever inserts valid ASCII characters before or after existing ASCII characters (or
/// returns two double quotes if the input was an empty string). It will never modify a
/// multibyte UTF-8 character.
pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
    if in_bytes.is_empty() {
        return b"\"\""[..].into();
    }

    let needs_quoting = in_bytes.iter().any(|&c| match c {
        b'|' | b'&' | b';' | b'<' | b'>' | b'(' | b')' | b'$' | b'`' | b'\\' | b'"' | b'\''
        | b' ' | b'\t' | b'\r' | b'\n' | b'*' | b'?' | b'[' | b'#' | b'~' | b'=' | b'%'
        // Unquoted braces are subject to brace expansion in bash-like shells.
        | b'{' | b'}' => true,
        _ => false,
    });
    if !needs_quoting {
        return in_bytes.into();
    }

    // At least the input plus the two surrounding quotes; escapes may grow it further.
    let mut out: Vec<u8> = Vec::with_capacity(in_bytes.len() + 2);
    out.push(b'"');
    for &c in in_bytes {
        // These four bytes keep their special meaning inside double quotes.
        match c {
            b'$' | b'`' | b'"' | b'\\' => out.push(b'\\'),
            _ => (),
        }
        out.push(c);
    }
    out.push(b'"');
    out.into()
}
|
||
/// Convenience function that consumes an iterable of words and turns it into a single byte string, | ||
/// quoting words when necessary. Consecutive words will be separated by a single space. | ||
pub fn join<'a, I: core::iter::IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> { | ||
words.into_iter() | ||
.map(quote) | ||
.collect::<Vec<_>>() | ||
.join(&b' ') | ||
} | ||
|
||
/// A byte string that is not valid UTF-8, shared by the tests in this module.
#[cfg(test)]
const INVALID_UTF8: &[u8] = b"\xa1";

#[test]
fn test_invalid_utf8() {
    // Sanity check on the fixture itself: decoding it as UTF-8 has to fail.
    let decoded = core::str::from_utf8(INVALID_UTF8);
    assert!(decoded.is_err());
}
|
||
/// Table of `(input, expected)` pairs for `test_split`: `Some(words)` lists the words the input
/// should split into, `None` marks an input that should be reported as erroneous.
#[cfg(test)]
static SPLIT_TEST_ITEMS: &[(&[u8], Option<&[&[u8]]>)] = &[
    (b"foo$baz", Some(&[b"foo$baz"])),
    (b"foo baz", Some(&[b"foo", b"baz"])),
    (b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
    (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
    (b" foo \nbar", Some(&[b"foo", b"bar"])),
    (b"foo\\\nbar", Some(&[b"foobar"])),
    (b"\"foo\\\nbar\"", Some(&[b"foobar"])),
    (b"'baz\\$b'", Some(&[b"baz\\$b"])),
    (b"'baz\\\''", None),
    (b"\\", None),
    (b"\"\\", None),
    (b"'\\", None),
    (b"\"", None),
    (b"'", None),
    (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
    (b"foo #bar", Some(&[b"foo"])),
    (b"foo#bar", Some(&[b"foo#bar"])),
    (b"foo\"#bar", None),
    (b"'\\n'", Some(&[b"\\n"])),
    (b"'\\\\n'", Some(&[b"\\\\n"])),
    (INVALID_UTF8, Some(&[INVALID_UTF8])),
];
|
||
#[test]
fn test_split() {
    // Every table entry must split to exactly the expected words, or to `None` on bad input.
    for &(input, expected) in SPLIT_TEST_ITEMS {
        let actual = split(input);
        let expected: Option<Vec<Vec<u8>>> =
            expected.map(|words| words.iter().map(|word| word.to_vec()).collect());
        assert_eq!(actual, expected);
    }
}
|
||
#[test]
fn test_lineno() {
    let mut sh = Shlex::new(b"\nfoo\nbar");
    let mut saw_bar = false;
    // `while let` rather than `for`: the loop body needs to read `sh.line_no` between items.
    while let Some(word) = sh.next() {
        if word == b"bar" {
            // Two newlines precede `bar`, so it sits on line 3.
            assert_eq!(sh.line_no, 3);
            saw_bar = true;
        }
    }
    // Without this the test would pass vacuously if the lexer never produced `bar`.
    assert!(saw_bar, "lexer never yielded the word `bar`");
}
|
||
#[test]
fn test_quote() {
    // (input, expected quoted form)
    let cases: &[(&[u8], &[u8])] = &[
        (&b"foobar"[..], &b"foobar"[..]),
        (&b"foo bar"[..], &b"\"foo bar\""[..]),
        (&b"\""[..], &b"\"\\\"\""[..]),
        (&b""[..], &b"\"\""[..]),
        (INVALID_UTF8, INVALID_UTF8),
    ];
    for &(input, expected) in cases {
        assert_eq!(quote(input), expected);
    }
}
|
||
#[test]
fn test_join() {
    // (words to join, expected joined byte string)
    let cases: Vec<(Vec<&[u8]>, &[u8])> = vec![
        (vec![], &b""[..]),
        (vec![&b""[..]], &b"\"\""[..]),
        (vec![&b"a"[..], &b"b"[..]], &b"a b"[..]),
        (vec![&b"foo bar"[..], &b"baz"[..]], &b"\"foo bar\" baz"[..]),
        (vec![INVALID_UTF8], INVALID_UTF8),
    ];
    for (words, expected) in cases {
        assert_eq!(join(words), expected);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters