Skip to content

Commit

Permalink
Add handling of valid UTF-8 sequences to string quoting routine
Browse files Browse the repository at this point in the history
  • Loading branch information
hillu committed Sep 25, 2021
1 parent b05661d commit c1a88c8
Show file tree
Hide file tree
Showing 5 changed files with 96 additions and 23 deletions.
4 changes: 2 additions & 2 deletions json-format.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,9 @@ More transformations will likely be added in the future.
- Most byte values that represent printable ASCII characters are reproduced as-is (but are subject to JSON string escaping rules).
- Bytes that map to non-printable ASCII characters (less than 32/0x20; 127/0x7f) are percent-encoded.
- Byte values that map to `%` (37/0x25) and `+` (43/0x2b) are percent-encoded.
- Byte values outside of the ASCII range (greater than 127/0x7f) are percent-encoded.
- Byte values outside of the ASCII range (greater than 127/0x7f) are reproduced as-is if they are part of a valid UTF-8 sequence. Otherwise, they are percent-encoded.

Handling of valid UTF-8 sequences will likely change in the future.
Handling of special Unicode characters may change in the future.

Rationale: The [JSON specification](https://datatracker.ietf.org/doc/html/rfc8259) mandates that "text exchanged between systems that are not part of a closed ecosystem MUST be encoded using UTF-8". JSON strings are composed of Unicode characters and thus cannot be used to represent arbitrary binary data. However, most values we think of as "strings" on Unix systems (processes, file names, command line arguments, environment variables) are, in reality, octet strings with varying restrictions. Being able to store those values without losing detail is important for log files that are used in a security context.

Expand Down
1 change: 1 addition & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use serde_json::{self,json};

pub mod types;
pub mod parser;
pub mod quoted_string;
pub mod coalesce;
pub mod proc;
pub mod rotate;
Expand Down
3 changes: 2 additions & 1 deletion src/proc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,8 @@ use nix::sys::time::TimeSpec;
use serde::{Serialize,Serializer};
use serde::ser::{SerializeStruct};

use crate::types::{EventID,Record,Value,Number,ToQuotedString};
use crate::types::{EventID,Record,Value,Number};
use crate::quoted_string::ToQuotedString;

lazy_static! {
/// kernel clock ticks per second
Expand Down
89 changes: 89 additions & 0 deletions src/quoted_string.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
use std::str;

/// Format byte sequence as a string that is suitable for serializing
/// to the audit log
pub(crate) trait ToQuotedString {
    /// Returns the quoted form of `self` as an owned `String`.
    fn to_quoted_string(&self) -> String;
}

impl ToQuotedString for [u8] {
    /// Quotes an arbitrary byte string.
    ///
    /// - Printable ASCII bytes (0x20..=0x7e) other than `%` and `+` are
    ///   copied verbatim.
    /// - A multi-byte sequence that `str::from_utf8` accepts is copied
    ///   verbatim.
    /// - Every other byte is written as `%` followed by two lowercase
    ///   hexadecimal digits.
    fn to_quoted_string(self: &[u8]) -> String {
        let mut out = String::with_capacity(self.len());
        let mut rest = self;
        while let Some((&byte, tail)) = rest.split_first() {
            if matches!(byte, 0x20..=0x7e) && byte != b'%' && byte != b'+' {
                out.push(char::from(byte));
                rest = tail;
                continue;
            }
            // Look ahead: if `byte` announces a multi-byte sequence, try to
            // validate exactly that many bytes. Truncated input, missing
            // continuation bytes, overlong forms, surrogates and values
            // above U+10FFFF all fail validation here.
            let decoded = match utf8_sequence_len(byte) {
                Some(len) => rest.get(..len).and_then(|seq| str::from_utf8(seq).ok()),
                None => None,
            };
            match decoded {
                Some(text) => {
                    out.push_str(text);
                    rest = &rest[text.len()..];
                }
                None => {
                    // Escape only this byte. Any continuation bytes that
                    // follow are escaped one by one on later iterations,
                    // and a following lead or ASCII byte gets a fresh
                    // chance to start a valid sequence.
                    push_percent_encoded(&mut out, byte);
                    rest = tail;
                }
            }
        }
        out
    }
}

/// Total length in bytes of the UTF-8 sequence announced by `lead`, or
/// `None` if `lead` has no multi-byte lead bit pattern.
fn utf8_sequence_len(lead: u8) -> Option<usize> {
    match lead {
        0xc0..=0xdf => Some(2),
        0xe0..=0xef => Some(3),
        0xf0..=0xf7 => Some(4),
        _ => None,
    }
}

/// Appends `byte` to `out` as `%xx` (two lowercase hex digits) without
/// allocating an intermediate string.
fn push_percent_encoded(out: &mut String, byte: u8) {
    const HEX: &[u8; 16] = b"0123456789abcdef";
    out.push('%');
    out.push(char::from(HEX[usize::from(byte >> 4)]));
    out.push(char::from(HEX[usize::from(byte & 0x0f)]));
}

#[cfg(test)]
mod test {
    use super::ToQuotedString;

    /// Printable ASCII, the two always-escaped characters, and well-formed
    /// UTF-8 sequences, including recovery after a broken prefix.
    #[test]
    fn to_quoted_string() {
        assert_eq!(" ", b" ".to_quoted_string());
        assert_eq!("asdf", b"asdf".to_quoted_string());
        assert_eq!("%2b", b"+".to_quoted_string());
        assert_eq!("%25", b"%".to_quoted_string());
        assert_eq!("%2b%2b%2b", b"+++".to_quoted_string());
        assert_eq!("%25%25%25", b"%%%".to_quoted_string());
        assert_eq!("%25%2b%25", b"%+%".to_quoted_string());
        assert_eq!("ä", b"\xc3\xa4".to_quoted_string());
        assert_eq!("€", b"\xe2\x82\xac".to_quoted_string());
        assert_eq!("💖", b"\xf0\x9f\x92\x96".to_quoted_string());
        assert_eq!("äöü", b"\xc3\xa4\xc3\xb6\xc3\xbc".to_quoted_string());
        assert_eq!("abcdäöüefgh", b"abcd\xc3\xa4\xc3\xb6\xc3\xbcefgh".to_quoted_string());
        assert_eq!("🄻🄰🅄🅁🄴🄻", b"\xf0\x9f\x84\xbb\xf0\x9f\x84\xb0\xf0\x9f\x85\x84\xf0\x9f\x85\x81\xf0\x9f\x84\xb4\xf0\x9f\x84\xbb".to_quoted_string());
        assert_eq!("%c3ä", b"\xc3\xc3\xa4".to_quoted_string());
        assert_eq!("%f0💖", b"\xf0\xf0\x9f\x92\x96".to_quoted_string());
        assert_eq!("%f0%9f💖", b"\xf0\x9f\xf0\x9f\x92\x96".to_quoted_string());
        assert_eq!("%f0%9f%92💖", b"\xf0\x9f\x92\xf0\x9f\x92\x96".to_quoted_string());
    }

    /// Empty input and ASCII bytes that are not printable.
    #[test]
    fn empty_and_control_bytes() {
        assert_eq!("", b"".to_quoted_string());
        assert_eq!("%00%1f%7f", b"\x00\x1f\x7f".to_quoted_string());
        // Quote and backslash are left alone here; JSON escaping is applied
        // by the serializer, not by this routine.
        assert_eq!("\"\\", b"\"\\".to_quoted_string());
    }

    /// Bytes above 0x7f that cannot start or continue a sequence at their
    /// position.
    #[test]
    fn stray_high_bytes() {
        assert_eq!("a%80b", b"a\x80b".to_quoted_string());
        assert_eq!("%bf", b"\xbf".to_quoted_string());
        assert_eq!("%f8%ff", b"\xf8\xff".to_quoted_string());
    }

    /// Sequences cut short by the end of input or by a non-continuation
    /// byte.
    #[test]
    fn truncated_sequences() {
        assert_eq!("%c3", b"\xc3".to_quoted_string());
        assert_eq!("%e2%82", b"\xe2\x82".to_quoted_string());
        assert_eq!("%f0%9f%92", b"\xf0\x9f\x92".to_quoted_string());
        assert_eq!("%e2%82a", b"\xe2\x82a".to_quoted_string());
        assert_eq!("%c3%25", b"\xc3%".to_quoted_string());
        assert_eq!("ä%c3", b"\xc3\xa4\xc3".to_quoted_string());
    }

    /// Structurally complete sequences that are nevertheless not valid
    /// UTF-8: overlong forms, a surrogate, and a value above U+10FFFF.
    #[test]
    fn malformed_but_complete_sequences() {
        assert_eq!("%c0%80", b"\xc0\x80".to_quoted_string());
        assert_eq!("%e0%80%80", b"\xe0\x80\x80".to_quoted_string());
        assert_eq!("%ed%a0%80", b"\xed\xa0\x80".to_quoted_string());
        assert_eq!("%f4%90%80%80", b"\xf4\x90\x80\x80".to_quoted_string());
    }
}
22 changes: 2 additions & 20 deletions src/types.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use serde::{Serialize,Serializer};
use serde::ser::{SerializeSeq,SerializeMap,Error};

use crate::constants::*;
use crate::quoted_string::ToQuotedString;

/// The identifier of an audit event, corresponding to the
/// `msg=audit(…)` part of every _auditd(8)_ log line.
Expand Down Expand Up @@ -365,26 +366,6 @@ impl<'a> Debug for RValue<'a> {
}
}

/// Render a byte sequence as text that can be written to the audit log.
pub(crate) trait ToQuotedString {
    /// Produces the quoted, owned representation of `self`.
    fn to_quoted_string(&self) -> String;
}

impl ToQuotedString for [u8] {
    /// Copies printable ASCII bytes through unchanged and writes `%`, `+`
    /// and every other byte as `%` plus two lowercase hex digits.
    fn to_quoted_string(self: &[u8]) -> String {
        // FIXME Properly handle UTF-8
        self.iter()
            .fold(String::with_capacity(self.len()), |mut quoted, &byte| {
                match byte {
                    // Printable ASCII, except the two reserved characters.
                    0x20..=0x7e if byte != b'%' && byte != b'+' => {
                        quoted.push(char::from(byte));
                    }
                    // Control bytes, DEL, high bytes, `%` and `+`.
                    _ => quoted.push_str(&format!("%{:02x}", byte)),
                }
                quoted
            })
    }
}

impl<'a> Serialize for RValue<'a> {
fn serialize<S: Serializer>(&self, s: S) -> Result<S::Ok,S::Error> {
Expand Down Expand Up @@ -448,3 +429,4 @@ impl Offset for Range<usize> {
Range{start: self.start + offset, end: self.end + offset }
}
}

0 comments on commit c1a88c8

Please sign in to comment.