Add support for operating on byte strings

This adds a `bytes` submodule for operating on byte strings that might contain invalid UTF-8. Where possible I have switched the functions that operate on `str` to use the `bytes` functions internally to avoid duplicating code and eliminate the potential for differing behavior between the two functions. It includes trivial tests that confirm that the `bytes` version of functions actually work on invalid UTF-8. Fixes #12
comex · Sep 4, 2023 · 879d212 · 879d212
1 parent aa2d6e3
commit 879d212
Show file tree

Hide file tree

Showing 4 changed files with 284 additions and 21 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+# Next release
+
+* Adds `bytes` module to support operating directly on byte strings.
+
 # 1.1.0
 
 * Adds the `std` feature (enabled by default)

diff --git a/README.md b/README.md
@@ -16,8 +16,9 @@ You only get the default settings of shlex.split, which mimic the POSIX shell:
 This implementation also deviates from the Python version in not treating \r
 specially, which I believe is more compliant.
 
-The algorithms in this crate are oblivious to UTF-8 high bytes, so they iterate
-over the bytes directly as a micro-optimization.
+This crate can be used on either normal Rust strings, or on byte strings with
+the `bytes` module. The algorithms used are oblivious to UTF-8 high bytes, so
+internally they all work on bytes directly as a micro-optimization.
 
 Disabling the `std` feature (which is enabled by default) will allow the crate
 to work in `no_std` environments, where the `alloc` crate, and a global

diff --git a/src/bytes.rs b/src/bytes.rs
@@ -0,0 +1,267 @@
+// Copyright 2015 Nicholas Allegra (comex).
+// Licensed under the Apache License, Version 2.0 <https://www.apache.org/licenses/LICENSE-2.0> or
+// the MIT license <https://opensource.org/licenses/MIT>, at your option. This file may not be
+// copied, modified, or distributed except according to those terms.
+
+//! [`Shlex`] and friends for byte strings.
+//!
+//! This may be more convenient if you are working with byte slices (`[u8]`)
+//! or types that are wrappers around bytes, such as [`OsStr`](std::ffi::OsStr):
+//!
+//! ```rust
+//! #[cfg(unix)] {
+//!     use shlex::bytes::quote;
+//!     use std::ffi::OsStr;
+//!     use std::os::unix::ffi::OsStrExt;
+//!
+//!     // `\x80` is invalid in UTF-8.
+//!     let os_str = OsStr::from_bytes(b"a\x80b c");
+//!     assert_eq!(quote(os_str.as_bytes()), &b"\"a\x80b c\""[..]);
+//! }
+//! ```
+//!
+//! (On Windows, `OsStr` uses 16 bit wide characters so this will not work.)
+
+extern crate alloc;
+use alloc::vec::Vec;
+use alloc::borrow::Cow;
+#[cfg(test)]
+use alloc::vec;
+#[cfg(test)]
+use alloc::borrow::ToOwned;
+
+/// An iterator that takes an input byte string and splits it into the words using the same syntax as
+/// the POSIX shell.
+pub struct Shlex<'a> {
+    in_iter: core::slice::Iter<'a, u8>,
+    /// The number of newlines read so far, plus one.
+    pub line_no: usize,
+    /// An input string is erroneous if it ends while inside a quotation or right after an
+    /// unescaped backslash.  Since Iterator does not have a mechanism to return an error, if that
+    /// happens, Shlex just throws out the last token, ends the iteration, and sets 'had_error' to
+    /// true; best to check it after you're done iterating.
+    pub had_error: bool,
+}
+
+impl<'a> Shlex<'a> {
+    pub fn new(in_bytes: &'a [u8]) -> Self {
+        Shlex {
+            in_iter: in_bytes.iter(),
+            line_no: 1,
+            had_error: false,
+        }
+    }
+
+    fn parse_word(&mut self, mut ch: u8) -> Option<Vec<u8>> {
+        let mut result: Vec<u8> = Vec::new();
+        loop {
+            match ch as char {
+                '"' => if let Err(()) = self.parse_double(&mut result) {
+                    self.had_error = true;
+                    return None;
+                },
+                '\'' => if let Err(()) = self.parse_single(&mut result) {
+                    self.had_error = true;
+                    return None;
+                },
+                '\\' => if let Some(ch2) = self.next_char() {
+                    if ch2 != '\n' as u8 { result.push(ch2); }
+                } else {
+                    self.had_error = true;
+                    return None;
+                },
+                ' ' | '\t' | '\n' => { break; },
+                _ => { result.push(ch as u8); },
+            }
+            if let Some(ch2) = self.next_char() { ch = ch2; } else { break; }
+        }
+        Some(result)
+    }
+
+    fn parse_double(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
+        loop {
+            if let Some(ch2) = self.next_char() {
+                match ch2 as char {
+                    '\\' => {
+                        if let Some(ch3) = self.next_char() {
+                            match ch3 as char {
+                                // \$ => $
+                                '$' | '`' | '"' | '\\' => { result.push(ch3); },
+                                // \<newline> => nothing
+                                '\n' => {},
+                                // \x => =x
+                                _ => { result.push('\\' as u8); result.push(ch3); }
+                            }
+                        } else {
+                            return Err(());
+                        }
+                    },
+                    '"' => { return Ok(()); },
+                    _ => { result.push(ch2); },
+                }
+            } else {
+                return Err(());
+            }
+        }
+    }
+
+    fn parse_single(&mut self, result: &mut Vec<u8>) -> Result<(), ()> {
+        loop {
+            if let Some(ch2) = self.next_char() {
+                match ch2 as char {
+                    '\'' => { return Ok(()); },
+                    _ => { result.push(ch2); },
+                }
+            } else {
+                return Err(());
+            }
+        }
+    }
+
+    fn next_char(&mut self) -> Option<u8> {
+        let res = self.in_iter.next().copied();
+        if res == Some(b'\n') { self.line_no += 1; }
+        res
+    }
+}
+
+impl<'a> Iterator for Shlex<'a> {
+    type Item = Vec<u8>;
+    fn next(&mut self) -> Option<Self::Item> {
+        if let Some(mut ch) = self.next_char() {
+            // skip initial whitespace
+            loop {
+                match ch as char {
+                    ' ' | '\t' | '\n' => {},
+                    '#' => {
+                        while let Some(ch2) = self.next_char() {
+                            if ch2 as char == '\n' { break; }
+                        }
+                    },
+                    _ => { break; }
+                }
+                if let Some(ch2) = self.next_char() { ch = ch2; } else { return None; }
+            }
+            self.parse_word(ch)
+        } else { // no initial character
+            None
+        }
+    }
+
+}
+
+/// Convenience function that consumes the whole byte string at once.  Returns None if the input was
+/// erroneous.
+pub fn split(in_bytes: &[u8]) -> Option<Vec<Vec<u8>>> {
+    let mut shl = Shlex::new(in_bytes);
+    let res = shl.by_ref().collect();
+    if shl.had_error { None } else { Some(res) }
+}
+
+/// Given a single word, return a byte string suitable to encode it as a shell argument.
+///
+/// If given valid UTF-8, this will never produce invalid UTF-8. This is because it only
+/// ever inserts valid ASCII characters before or after existing ASCII characters (or
+/// returns two double quotes if the input was an empty string). It will never modify a
+/// multibyte UTF-8 character.
+pub fn quote(in_bytes: &[u8]) -> Cow<[u8]> {
+    if in_bytes.len() == 0 {
+        b"\"\""[..].into()
+    } else if in_bytes.iter().any(|c| match *c as char {
+        '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' |
+        '\r' | '\n' | '*' | '?' | '[' | '#' | '~' | '=' | '%' => true,
+        _ => false
+    }) {
+        let mut out: Vec<u8> = Vec::new();
+        out.push(b'"');
+        for &c in in_bytes {
+            match c {
+                b'$' | b'`' | b'"' | b'\\' => out.push(b'\\'),
+                _ => ()
+            }
+            out.push(c);
+        }
+        out.push(b'"');
+        out.into()
+    } else {
+        in_bytes.into()
+    }
+}
+
+/// Convenience function that consumes an iterable of words and turns it into a single byte string,
+/// quoting words when necessary. Consecutive words will be separated by a single space.
+pub fn join<'a, I: core::iter::IntoIterator<Item = &'a [u8]>>(words: I) -> Vec<u8> {
+    words.into_iter()
+        .map(quote)
+        .collect::<Vec<_>>()
+        .join(&b' ')
+}
+
+#[cfg(test)]
+const INVALID_UTF8: &[u8] = b"\xa1";
+
+#[test]
+fn test_invalid_utf8() {
+    // Check that our test string is actually invalid UTF-8.
+    assert!(core::str::from_utf8(INVALID_UTF8).is_err());
+}
+
+#[cfg(test)]
+static SPLIT_TEST_ITEMS: &'static [(&'static [u8], Option<&'static [&'static [u8]]>)] = &[
+    (b"foo$baz", Some(&[b"foo$baz"])),
+    (b"foo baz", Some(&[b"foo", b"baz"])),
+    (b"foo\"bar\"baz", Some(&[b"foobarbaz"])),
+    (b"foo \"bar\"baz", Some(&[b"foo", b"barbaz"])),
+    (b"   foo \nbar", Some(&[b"foo", b"bar"])),
+    (b"foo\\\nbar", Some(&[b"foobar"])),
+    (b"\"foo\\\nbar\"", Some(&[b"foobar"])),
+    (b"'baz\\$b'", Some(&[b"baz\\$b"])),
+    (b"'baz\\\''", None),
+    (b"\\", None),
+    (b"\"\\", None),
+    (b"'\\", None),
+    (b"\"", None),
+    (b"'", None),
+    (b"foo #bar\nbaz", Some(&[b"foo", b"baz"])),
+    (b"foo #bar", Some(&[b"foo"])),
+    (b"foo#bar", Some(&[b"foo#bar"])),
+    (b"foo\"#bar", None),
+    (b"'\\n'", Some(&[b"\\n"])),
+    (b"'\\\\n'", Some(&[b"\\\\n"])),
+    (INVALID_UTF8, Some(&[INVALID_UTF8])),
+];
+
+#[test]
+fn test_split() {
+    for &(input, output) in SPLIT_TEST_ITEMS {
+        assert_eq!(split(input), output.map(|o| o.iter().map(|&x| x.to_owned()).collect()));
+    }
+}
+
+#[test]
+fn test_lineno() {
+    let mut sh = Shlex::new(b"\nfoo\nbar");
+    while let Some(word) = sh.next() {
+        if word == b"bar" {
+            assert_eq!(sh.line_no, 3);
+        }
+    }
+}
+
+#[test]
+fn test_quote() {
+    assert_eq!(quote(b"foobar"), &b"foobar"[..]);
+    assert_eq!(quote(b"foo bar"), &b"\"foo bar\""[..]);
+    assert_eq!(quote(b"\""), &b"\"\\\"\""[..]);
+    assert_eq!(quote(b""), &b"\"\""[..]);
+    assert_eq!(quote(INVALID_UTF8), INVALID_UTF8);
+}
+
+#[test]
+fn test_join() {
+    assert_eq!(join(vec![]), &b""[..]);
+    assert_eq!(join(vec![&b""[..]]), &b"\"\""[..]);
+    assert_eq!(join(vec![&b"a"[..], &b"b"[..]]), &b"a b"[..]);
+    assert_eq!(join(vec![&b"foo bar"[..], &b"baz"[..]]), &b"\"foo bar\" baz"[..]);
+    assert_eq!(join(vec![INVALID_UTF8]), INVALID_UTF8);
+}
diff --git a/src/lib.rs b/src/lib.rs
@@ -29,6 +29,8 @@ use alloc::vec;
 #[cfg(test)]
 use alloc::borrow::ToOwned;
 
+pub mod bytes;
+
 /// An iterator that takes an input string and splits it into the words using the same syntax as
 /// the POSIX shell.
 pub struct Shlex<'a> {
@@ -159,26 +161,15 @@ pub fn split(in_str: &str) -> Option<Vec<String>> {
 
 /// Given a single word, return a string suitable to encode it as a shell argument.
 pub fn quote(in_str: &str) -> Cow<str> {
-    if in_str.len() == 0 {
-        "\"\"".into()
-    } else if in_str.bytes().any(|c| match c as char {
-        '|' | '&' | ';' | '<' | '>' | '(' | ')' | '$' | '`' | '\\' | '"' | '\'' | ' ' | '\t' |
-        '\r' | '\n' | '*' | '?' | '[' | '#' | '~' | '=' | '%' => true,
-        _ => false
-    }) {
-        let mut out: Vec<u8> = Vec::new();
-        out.push('"' as u8);
-        for c in in_str.bytes() {
-            match c as char {
-                '$' | '`' | '"' | '\\' => out.push('\\' as u8),
-                _ => ()
-            }
-            out.push(c);
+    match bytes::quote(in_str.as_bytes()) {
+        Cow::Borrowed(out) => {
+            // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
+            unsafe { core::str::from_utf8_unchecked(out) }.into()
+        }
+        Cow::Owned(out) => {
+            // Safety: given valid UTF-8, bytes::quote() will always return valid UTF-8.
+            unsafe { String::from_utf8_unchecked(out) }.into()
         }
-        out.push('"' as u8);
-        unsafe { String::from_utf8_unchecked(out) }.into()
-    } else {
-        in_str.into()
     }
 }