From c1a88c8cd69b9d2477778e10e2386550d9533516 Mon Sep 17 00:00:00 2001 From: Hilko Bengen Date: Wed, 22 Sep 2021 13:20:40 +0200 Subject: [PATCH] Add handling of valid UTF-8 sequences to string quoting routine --- json-format.md | 4 +- src/main.rs | 1 + src/proc.rs | 3 +- src/quoted_string.rs | 89 ++++++++++++++++++++++++++++++++++++++++++++ src/types.rs | 22 +---------- 5 files changed, 96 insertions(+), 23 deletions(-) create mode 100644 src/quoted_string.rs diff --git a/json-format.md b/json-format.md index 7162b87..264ab93 100644 --- a/json-format.md +++ b/json-format.md @@ -24,9 +24,9 @@ More transformations will likely be added in the future. - Most byte values that represent printable ASCII characters are reproduced as-is (but are subject to JSON string escaping rules). - Bytes that map to non-printable ASCII characters (less than 32/0x20; 127/0x7f) are percent-encoded. - Byte values that map to `%` (37/0x25) and `+` (42/0x2b) are percent-encoded. -- Byte values outside of the ASCII range (greater than 127/0x7f) are percent-encoded. +- Byte values outside of the ASCII range (greater than 127/0x7f) are reproduced as-is if they are part of a valid UTF-8 sequence. Otherwise, they are percent-encoded. -Handling of valid UTF-8 sequences will likely change in the future. +Handling of special Unicode characters may change in the future. Rationale: The [JSON specification](https://datatracker.ietf.org/doc/html/rfc8259) mandates that "text exchanged between systems that are not part of a closed ecosystem MUST be encoded using UTF-8". JSON strings are comprised of Unicode character and thus cannot be used to represent arbitrary binary data. However, most values we think of as "strings" on Unix systems (processes, file names, command line arguments, environment variables) are, in reality, octet strings with varying restrictions. Being able to store those values without losing detail is important for log files that are used in a security context. 
diff --git a/src/main.rs b/src/main.rs
index eab3f8c..d8f0afd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -21,6 +21,7 @@ use serde_json::{self,json};
 
 pub mod types;
 pub mod parser;
+pub mod quoted_string;
 pub mod coalesce;
 pub mod proc;
 pub mod rotate;
diff --git a/src/proc.rs b/src/proc.rs
index 4ebafd3..137a093 100644
--- a/src/proc.rs
+++ b/src/proc.rs
@@ -18,7 +18,8 @@ use nix::sys::time::TimeSpec;
 use serde::{Serialize,Serializer};
 use serde::ser::{SerializeStruct};
 
-use crate::types::{EventID,Record,Value,Number,ToQuotedString};
+use crate::types::{EventID,Record,Value,Number};
+use crate::quoted_string::ToQuotedString;
 
 lazy_static! {
     /// kernel clock ticks per second
diff --git a/src/quoted_string.rs b/src/quoted_string.rs
new file mode 100644
index 0000000..6980d3d
--- /dev/null
+++ b/src/quoted_string.rs
@@ -0,0 +1,89 @@
+use std::fmt::Write;
+use std::str;
+
+/// Format byte sequence as a string that is suitable for serializing
+/// to the audit log
+pub(crate) trait ToQuotedString {
+    fn to_quoted_string(&self) -> String;
+}
+
+/// Append every byte of `bytes` to `sb` in percent-encoded form
+/// (`%` followed by two lower-case hex digits).
+fn push_escaped(sb: &mut String, bytes: &[u8]) {
+    for b in bytes {
+        // Writing to a `String` never returns an error.
+        let _ = write!(sb, "%{:02x}", b);
+    }
+}
+
+/// Append the valid UTF-8 text `s` to `sb`. Non-ASCII characters and
+/// printable ASCII characters other than `%` and `+` are copied
+/// as-is; all remaining ASCII characters are percent-encoded.
+fn push_text(sb: &mut String, s: &str) {
+    for c in s.chars() {
+        let printable = c == ' ' || c.is_ascii_graphic();
+        if !c.is_ascii() || (printable && c != '%' && c != '+') {
+            sb.push(c);
+        } else {
+            // `c` is ASCII in this branch, so the cast is lossless.
+            push_escaped(sb, &[c as u8]);
+        }
+    }
+}
+
+impl ToQuotedString for [u8] {
+    /// Bytes that are not part of a valid UTF-8 sequence are
+    /// percent-encoded one by one; all other input goes through
+    /// `push_text`.
+    fn to_quoted_string(&self) -> String {
+        let mut sb = String::with_capacity(self.len());
+        let mut rest = self;
+        loop {
+            let err = match str::from_utf8(rest) {
+                Ok(s) => {
+                    push_text(&mut sb, s);
+                    return sb;
+                }
+                Err(err) => err,
+            };
+            let (valid, tail) = rest.split_at(err.valid_up_to());
+            // Re-validation cannot fail: `valid_up_to` marks the end
+            // of the longest valid prefix.
+            push_text(&mut sb, str::from_utf8(valid).unwrap_or_default());
+            // `error_len` is `None` if the input ends in the middle
+            // of a sequence; escape all remaining bytes in that case.
+            let bad = err.error_len().unwrap_or(tail.len());
+            push_escaped(&mut sb, &tail[..bad]);
+            rest = &tail[bad..];
+        }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::ToQuotedString;
+    #[test]
+    fn to_quoted_string() {
+        assert_eq!(" ", b" ".to_quoted_string());
+        assert_eq!("asdf", b"asdf".to_quoted_string());
+        assert_eq!("%2b", b"+".to_quoted_string());
+        assert_eq!("%25", b"%".to_quoted_string());
+        assert_eq!("%2b%2b%2b", b"+++".to_quoted_string());
+        assert_eq!("%25%25%25", b"%%%".to_quoted_string());
+        assert_eq!("%25%2b%25", b"%+%".to_quoted_string());
+        assert_eq!("ä", b"\xc3\xa4".to_quoted_string());
+        assert_eq!("€", b"\xe2\x82\xac".to_quoted_string());
+        assert_eq!("💖", b"\xf0\x9f\x92\x96".to_quoted_string());
+        assert_eq!("äöü", b"\xc3\xa4\xc3\xb6\xc3\xbc".to_quoted_string());
+        assert_eq!("abcdäöüefgh", b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh".to_quoted_string());
+        assert_eq!("🄻🄰🅄🅁🄴🄻", b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb".to_quoted_string());
+        assert_eq!("%c3ä", b"\xc3\xc3\xa4".to_quoted_string());
+        assert_eq!("%f0💖", b"\xf0\xf0\x9f\x92\x96".to_quoted_string());
+        assert_eq!("%f0%9f💖", b"\xf0\x9f\xf0\x9f\x92\x96".to_quoted_string());
+        assert_eq!("%f0%9f%92💖", b"\xf0\x9f\x92\xf0\x9f\x92\x96".to_quoted_string());
+        assert_eq!("", b"".to_quoted_string());
+        assert_eq!("%00%0a%7f", b"\x00\n\x7f".to_quoted_string());
+        assert_eq!("ab%e2%82", b"ab\xe2\x82".to_quoted_string());
+        assert_eq!("%80%c0%80%ff", b"\x80\xc0\x80\xff".to_quoted_string());
+    }
+}
diff --git a/src/types.rs b/src/types.rs
index 236247e..688284a 100644
--- a/src/types.rs
+++ b/src/types.rs
@@ -10,6 +10,7 @@ use serde::{Serialize,Serializer};
 use serde::ser::{SerializeSeq,SerializeMap,Error};
 
 use crate::constants::*;
+use crate::quoted_string::ToQuotedString;
 
 /// The identifier of an audit event, corresponding to the
 /// `msg=audit(…)` part of every _auditd(8)_ log line.
@@ -365,26 +366,6 @@ impl<'a> Debug for RValue<'a> { } } -/// Format byte sequence as a string that is suitable for serializing -/// to the audit log -pub(crate) trait ToQuotedString { - fn to_quoted_string(&self) -> String; -} - -impl ToQuotedString for [u8] { - fn to_quoted_string(self: &[u8]) -> String { - // FIXME Properly handle UTF-8 - let mut sb = String::with_capacity(self.len()); - for c in self { - if *c < 32 || *c == b'%' || *c == b'+' || *c >= 127 { - sb.push_str(&format!("%{:02x}", *c)) - } else { - sb.push(*c as char) - } - } - sb - } -} impl<'a> Serialize for RValue<'a> { fn serialize(&self, s: S) -> Result { @@ -448,3 +429,4 @@ impl Offset for Range { Range{start: self.start + offset, end: self.end + offset } } } +