From a199a3b19ae58edcd8b757dbe0dcfc6fa2dfded8 Mon Sep 17 00:00:00 2001
From: "Neal H. Walfield"
Date: Sun, 23 Jun 2019 01:11:44 +0200
Subject: [PATCH 1/7] Add a streaming reader

---
 src/lib.rs                |   1 +
 src/read/decoder.rs       | 133 ++++++++++++++++++++++++++++++++++++++
 src/read/decoder_tests.rs | 129 ++++++++++++++++++++++++++++++++++++
 src/read/mod.rs           |   6 ++
 4 files changed, 269 insertions(+)
 create mode 100644 src/read/decoder.rs
 create mode 100644 src/read/decoder_tests.rs
 create mode 100644 src/read/mod.rs

diff --git a/src/lib.rs b/src/lib.rs
index d6fd3142..16dd6fbc 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -74,6 +74,7 @@ mod chunked_encoder;
 pub mod display;
 mod tables;
 pub mod write;
+pub mod read;
 
 mod encode;
 pub use encode::{encode, encode_config, encode_config_buf, encode_config_slice};

diff --git a/src/read/decoder.rs b/src/read/decoder.rs
new file mode 100644
index 00000000..136b30f6
--- /dev/null
+++ b/src/read/decoder.rs
@@ -0,0 +1,133 @@
+use std::io::{Error, ErrorKind, Result, Read};
+use std::fmt;
+use std::cmp;
+use {decode_config_slice, Config};
+
+// This should be large, but it has to fit on the stack.
+pub(crate) const BUF_SIZE: usize = 1024;
+
+// 4 bytes of base64 data encode 3 bytes of raw data (modulo padding).
+const BASE64_CHUNK_SIZE : usize = 4;
+const RAW_CHUNK_SIZE : usize = 3;
+
+/// A `Read` implementation that decodes base64 data read from an underlying reader.
+///
+/// # Examples
+///
+/// ```
+/// use std::io::Read;
+/// use std::io::Cursor;
+///
+/// // use a cursor as the simplest possible `Read` -- in real code this is probably a file, etc.
+/// let mut wrapped_reader = Cursor::new(b"YXNkZg==");
+/// let mut decoder = base64::read::DecoderReader::new(
+///     &mut wrapped_reader, base64::STANDARD);
+///
+/// // handle errors as you normally would
+/// let mut result = Vec::new();
+/// decoder.read_to_end(&mut result).unwrap();
+///
+/// assert_eq!(b"asdf", &result[..]);
+///
+/// ```
+pub struct DecoderReader<'a, R: 'a + Read> {
+    config: Config,
+    /// Where encoded data is read from
+    r: &'a mut R,
+
+    // The maximum amount of decoded base64 data that we'll buffer at once.
+    buffer: [u8; BUF_SIZE],
+    // The start of the unreturned buffered data.
+    buffer_offset: usize,
+    // The amount of buffered data.
+    buffer_amount: usize,
+}
+
+impl<'a, R: Read> fmt::Debug for DecoderReader<'a, R> {
+    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
+        f.debug_struct("DecoderReader")
+            .field("config", &self.config)
+            .field("buffer_offset", &self.buffer_offset)
+            .field("buffer_amount", &self.buffer_amount)
+            .finish()
+    }
+}
+
+impl<'a, R: Read> DecoderReader<'a, R> {
+    /// Create a new decoder that will read from the provided reader `r`.
+    pub fn new(r: &'a mut R, config: Config) -> Self {
+        DecoderReader {
+            config,
+            r,
+            buffer: [ 0; BUF_SIZE ],
+            buffer_offset: 0,
+            buffer_amount: 0,
+        }
+    }
+}
+
+impl<'a, R: Read> Read for DecoderReader<'a, R> {
+    /// Decode input from the wrapped reader.
+    ///
+    /// Under non-error circumstances, this returns `Ok` with the value being the number of bytes
+    /// returned in `buf`.
+    ///
+    /// If the caller performs a short read, then this function reads in a large chunk of data,
+    /// decodes that, and buffers the result. The intent is to amortize the decoding cost
+    /// of many small reads.
+    ///
+    /// # Errors
+    ///
+    /// Any errors emitted by the delegate reader are returned.
+    fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
+        // Check some invariants.
+ assert!(self.buffer_offset < BUF_SIZE); + assert!(self.buffer_amount <= BUF_SIZE); + assert!(self.buffer_offset + self.buffer_amount <= BUF_SIZE); + + if self.buffer_amount > 0 { + // We have something buffered, use that. + + let amount = cmp::min(buf.len(), self.buffer_amount); + buf[..amount].copy_from_slice(&self.buffer[self.buffer_offset..self.buffer_offset+amount]); + self.buffer_offset += amount; + self.buffer_amount -= amount; + + Ok(amount) + } else if buf.len() >= 2 * RAW_CHUNK_SIZE { + // The caller wants at least two chunks. Round down to a + // multiple of the chunk size and decode directly into the + // caller-provided buffer. + + let mut base64_data = [0u8; BUF_SIZE]; + let base64_bytes = cmp::min(BUF_SIZE, (buf.len() / RAW_CHUNK_SIZE) * BASE64_CHUNK_SIZE); + assert!(base64_bytes > 0); + + let read = self.r.read(&mut base64_data[..base64_bytes])?; + + let decoded = decode_config_slice(&base64_data[..read], self.config, buf) + .map_err(|e| Error::new(ErrorKind::InvalidData, e))?; + + Ok(decoded) + } else { + // The caller wants less than a chunk of decoded data + // (i.e., one or two bytes). We have to buffer something. + // Double buffer a large amount in case short reads turn + // out to be common. + + let mut base64_data = [0u8; BUF_SIZE]; + let read = self.r.read(&mut base64_data)?; + + let decoded = decode_config_slice(&base64_data[..read], self.config, &mut self.buffer) + .map_err(|e| Error::new(ErrorKind::InvalidData, e))?; + + let returning = cmp::min(buf.len(), decoded); + buf[..returning].copy_from_slice(&self.buffer[..returning]); + + self.buffer_offset = returning; + self.buffer_amount = decoded - returning; + + Ok(returning) + } + } +} diff --git a/src/read/decoder_tests.rs b/src/read/decoder_tests.rs new file mode 100644 index 00000000..7b8477dd --- /dev/null +++ b/src/read/decoder_tests.rs @@ -0,0 +1,129 @@ +extern crate rand; + +use std::io::Read; + +use std::io::Cursor; +use self::rand::Rng; + +use encode::encode_config_buf; +use tests::random_config; +use read::decoder::BUF_SIZE; + +#[test] +fn simple() { + let tests : &[(&[u8], &[u8])] = &[ + ( &b"0"[..], &b"MA=="[..] ), + ( b"01", b"MDE=" ), + ( b"012", b"MDEy" ), + ( b"0123", b"MDEyMw==" ), + ( b"01234", b"MDEyMzQ=" ), + ( b"012345", b"MDEyMzQ1" ), + ( b"0123456", b"MDEyMzQ1Ng==" ), + ( b"01234567", b"MDEyMzQ1Njc=" ), + ( b"012345678", b"MDEyMzQ1Njc4" ), + ( b"0123456789", b"MDEyMzQ1Njc4OQ==" ), + ][..]; + + for (text_expected, base64data) in tests.iter() { + // Read n bytes at a time. + for n in 1..base64data.len() + 1 { + let mut wrapped_reader = Cursor::new(base64data); + let mut decoder = ::read::DecoderReader::new( + &mut wrapped_reader, ::STANDARD); + + // handle errors as you normally would + let mut text_got = Vec::new(); + let mut buffer = vec![ 0u8; n ]; + while let Ok(read) = decoder.read(&mut buffer[..]) { + if read == 0 { + break; + } + text_got.extend_from_slice(&buffer[..read]); + } + + assert_eq!(text_got, *text_expected, + "\nGot: {}\nExpected: {}", + String::from_utf8_lossy(&text_got[..]), + String::from_utf8_lossy(text_expected)); + } + } +} + +#[test] +fn big() { + let mut rng = rand::thread_rng(); + + for _ in 0..100 { + // The size. Not even multiples of BUF_SIZE. + let size = rng.gen_range(0, 10 * BUF_SIZE); + + // Create the random content. + let mut data_orig = Vec::with_capacity(size); + for _ in 0..size { + data_orig.push(rng.gen()); + } + + let config = random_config(&mut rng); + + // Encode it. 
+ let mut encoded = String::new(); + encode_config_buf(&data_orig, config, &mut encoded); + + // Amount to read at a time: small, medium, large and XL. + for &n in &[ rng.gen_range(1, 10), + rng.gen_range(1, 100), + rng.gen_range(1, BUF_SIZE), + BUF_SIZE + 1 ] + { + let mut wrapped_reader = Cursor::new(encoded.clone()); + let mut decoder = ::read::DecoderReader::new( + &mut wrapped_reader, config); + + let mut data_got = Vec::new(); + let mut buffer = vec![ 0u8; n ]; + while let Ok(read) = decoder.read(&mut buffer[..]) { + if read == 0 { + break; + } + data_got.extend_from_slice(&buffer[..read]); + } + + if data_got != data_orig { + panic!("\nGot: {}\nExpected: {}", + String::from_utf8_lossy(&data_got[..]), + String::from_utf8_lossy(&data_orig[..])); + } + } + } +} + +// Make sure we error out on trailing junk. +#[test] +fn trailing_junk() { + let tests : &[&[u8]] = &[ + &b"MDEyMzQ1Njc4*!@#$%^&"[..], + b"MDEyMzQ1Njc4OQ== ", + ][..]; + + for base64data in tests.iter() { + // Read n bytes at a time. + for n in 1..base64data.len() + 1 { + let mut wrapped_reader = Cursor::new(base64data); + let mut decoder = ::read::DecoderReader::new( + &mut wrapped_reader, ::STANDARD); + + // handle errors as you normally would + let mut buffer = vec![ 0u8; n ]; + let mut saw_error = false; + loop { + match decoder.read(&mut buffer[..]) { + Err(_) => saw_error = true, + Ok(read) if read == 0 => break, + Ok(_) => (), + } + } + + assert!(saw_error); + } + } +} diff --git a/src/read/mod.rs b/src/read/mod.rs new file mode 100644 index 00000000..85606448 --- /dev/null +++ b/src/read/mod.rs @@ -0,0 +1,6 @@ +//! Implementations of `io::Read` to transparently decode base64. +mod decoder; +pub use self::decoder::DecoderReader; + +#[cfg(test)] +mod decoder_tests; From 0f08888ca4d4ef0b6197337a3de1460ccf875a6a Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Tue, 19 Nov 2019 21:25:09 -0700 Subject: [PATCH 2/7] Apply rustfmt --- src/lib.rs | 2 +- src/read/decoder.rs | 13 +++---- src/read/decoder_tests.rs | 76 +++++++++++++++++++-------------------- 3 files changed, 46 insertions(+), 45 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index d192afb1..e74e6bd8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,10 +77,10 @@ doctest!("../README.md"); mod chunked_encoder; pub mod display; +pub mod read; mod tables; #[cfg(any(feature = "std", test))] pub mod write; -pub mod read; mod encode; pub use crate::encode::encode_config_slice; diff --git a/src/read/decoder.rs b/src/read/decoder.rs index 136b30f6..3621b3c3 100644 --- a/src/read/decoder.rs +++ b/src/read/decoder.rs @@ -1,14 +1,14 @@ -use std::io::{Error, ErrorKind, Result, Read}; -use std::fmt; use std::cmp; +use std::fmt; +use std::io::{Error, ErrorKind, Read, Result}; use {decode_config_slice, Config}; // This should be large, but it has to fit on the stack. pub(crate) const BUF_SIZE: usize = 1024; // 4 bytes of base64 data encode 3 bytes of raw data (modulo padding). -const BASE64_CHUNK_SIZE : usize = 4; -const RAW_CHUNK_SIZE : usize = 3; +const BASE64_CHUNK_SIZE: usize = 4; +const RAW_CHUNK_SIZE: usize = 3; /// A `Read` implementation that decodes base64 data read from an underlying reader. /// @@ -59,7 +59,7 @@ impl<'a, R: Read> DecoderReader<'a, R> { DecoderReader { config, r, - buffer: [ 0; BUF_SIZE ], + buffer: [0; BUF_SIZE], buffer_offset: 0, buffer_amount: 0, } @@ -89,7 +89,8 @@ impl<'a, R: Read> Read for DecoderReader<'a, R> { // We have something buffered, use that. 
let amount = cmp::min(buf.len(), self.buffer_amount); - buf[..amount].copy_from_slice(&self.buffer[self.buffer_offset..self.buffer_offset+amount]); + buf[..amount] + .copy_from_slice(&self.buffer[self.buffer_offset..self.buffer_offset + amount]); self.buffer_offset += amount; self.buffer_amount -= amount; diff --git a/src/read/decoder_tests.rs b/src/read/decoder_tests.rs index 7b8477dd..f6442315 100644 --- a/src/read/decoder_tests.rs +++ b/src/read/decoder_tests.rs @@ -2,38 +2,37 @@ extern crate rand; use std::io::Read; -use std::io::Cursor; use self::rand::Rng; +use std::io::Cursor; use encode::encode_config_buf; -use tests::random_config; use read::decoder::BUF_SIZE; +use tests::random_config; #[test] fn simple() { - let tests : &[(&[u8], &[u8])] = &[ - ( &b"0"[..], &b"MA=="[..] ), - ( b"01", b"MDE=" ), - ( b"012", b"MDEy" ), - ( b"0123", b"MDEyMw==" ), - ( b"01234", b"MDEyMzQ=" ), - ( b"012345", b"MDEyMzQ1" ), - ( b"0123456", b"MDEyMzQ1Ng==" ), - ( b"01234567", b"MDEyMzQ1Njc=" ), - ( b"012345678", b"MDEyMzQ1Njc4" ), - ( b"0123456789", b"MDEyMzQ1Njc4OQ==" ), + let tests: &[(&[u8], &[u8])] = &[ + (&b"0"[..], &b"MA=="[..]), + (b"01", b"MDE="), + (b"012", b"MDEy"), + (b"0123", b"MDEyMw=="), + (b"01234", b"MDEyMzQ="), + (b"012345", b"MDEyMzQ1"), + (b"0123456", b"MDEyMzQ1Ng=="), + (b"01234567", b"MDEyMzQ1Njc="), + (b"012345678", b"MDEyMzQ1Njc4"), + (b"0123456789", b"MDEyMzQ1Njc4OQ=="), ][..]; for (text_expected, base64data) in tests.iter() { // Read n bytes at a time. for n in 1..base64data.len() + 1 { let mut wrapped_reader = Cursor::new(base64data); - let mut decoder = ::read::DecoderReader::new( - &mut wrapped_reader, ::STANDARD); + let mut decoder = ::read::DecoderReader::new(&mut wrapped_reader, ::STANDARD); // handle errors as you normally would let mut text_got = Vec::new(); - let mut buffer = vec![ 0u8; n ]; + let mut buffer = vec![0u8; n]; while let Ok(read) = decoder.read(&mut buffer[..]) { if read == 0 { break; @@ -41,10 +40,13 @@ fn simple() { text_got.extend_from_slice(&buffer[..read]); } - assert_eq!(text_got, *text_expected, - "\nGot: {}\nExpected: {}", - String::from_utf8_lossy(&text_got[..]), - String::from_utf8_lossy(text_expected)); + assert_eq!( + text_got, + *text_expected, + "\nGot: {}\nExpected: {}", + String::from_utf8_lossy(&text_got[..]), + String::from_utf8_lossy(text_expected) + ); } } } @@ -70,17 +72,17 @@ fn big() { encode_config_buf(&data_orig, config, &mut encoded); // Amount to read at a time: small, medium, large and XL. - for &n in &[ rng.gen_range(1, 10), - rng.gen_range(1, 100), - rng.gen_range(1, BUF_SIZE), - BUF_SIZE + 1 ] - { + for &n in &[ + rng.gen_range(1, 10), + rng.gen_range(1, 100), + rng.gen_range(1, BUF_SIZE), + BUF_SIZE + 1, + ] { let mut wrapped_reader = Cursor::new(encoded.clone()); - let mut decoder = ::read::DecoderReader::new( - &mut wrapped_reader, config); + let mut decoder = ::read::DecoderReader::new(&mut wrapped_reader, config); let mut data_got = Vec::new(); - let mut buffer = vec![ 0u8; n ]; + let mut buffer = vec![0u8; n]; while let Ok(read) = decoder.read(&mut buffer[..]) { if read == 0 { break; @@ -89,9 +91,11 @@ fn big() { } if data_got != data_orig { - panic!("\nGot: {}\nExpected: {}", - String::from_utf8_lossy(&data_got[..]), - String::from_utf8_lossy(&data_orig[..])); + panic!( + "\nGot: {}\nExpected: {}", + String::from_utf8_lossy(&data_got[..]), + String::from_utf8_lossy(&data_orig[..]) + ); } } } @@ -100,20 +104,16 @@ fn big() { // Make sure we error out on trailing junk. 
#[test] fn trailing_junk() { - let tests : &[&[u8]] = &[ - &b"MDEyMzQ1Njc4*!@#$%^&"[..], - b"MDEyMzQ1Njc4OQ== ", - ][..]; + let tests: &[&[u8]] = &[&b"MDEyMzQ1Njc4*!@#$%^&"[..], b"MDEyMzQ1Njc4OQ== "][..]; for base64data in tests.iter() { // Read n bytes at a time. for n in 1..base64data.len() + 1 { let mut wrapped_reader = Cursor::new(base64data); - let mut decoder = ::read::DecoderReader::new( - &mut wrapped_reader, ::STANDARD); + let mut decoder = ::read::DecoderReader::new(&mut wrapped_reader, ::STANDARD); // handle errors as you normally would - let mut buffer = vec![ 0u8; n ]; + let mut buffer = vec![0u8; n]; let mut saw_error = false; loop { match decoder.read(&mut buffer[..]) { From 3b755626b093a1bfe3558af5ebc489b2ddd0e1c0 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Tue, 19 Nov 2019 21:32:11 -0700 Subject: [PATCH 3/7] Get it buildilng with current rust --- src/lib.rs | 1 + src/read/decoder.rs | 2 +- src/read/decoder_tests.rs | 17 ++++++++--------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index e74e6bd8..34117b79 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -77,6 +77,7 @@ doctest!("../README.md"); mod chunked_encoder; pub mod display; +#[cfg(any(feature = "std", test))] pub mod read; mod tables; #[cfg(any(feature = "std", test))] diff --git a/src/read/decoder.rs b/src/read/decoder.rs index 3621b3c3..b3475ed6 100644 --- a/src/read/decoder.rs +++ b/src/read/decoder.rs @@ -1,7 +1,7 @@ use std::cmp; use std::fmt; use std::io::{Error, ErrorKind, Read, Result}; -use {decode_config_slice, Config}; +use crate::{decode_config_slice, Config}; // This should be large, but it has to fit on the stack. pub(crate) const BUF_SIZE: usize = 1024; diff --git a/src/read/decoder_tests.rs b/src/read/decoder_tests.rs index f6442315..78eb3bb0 100644 --- a/src/read/decoder_tests.rs +++ b/src/read/decoder_tests.rs @@ -1,13 +1,12 @@ -extern crate rand; - use std::io::Read; -use self::rand::Rng; +use rand::Rng; use std::io::Cursor; -use encode::encode_config_buf; -use read::decoder::BUF_SIZE; -use tests::random_config; +use crate::STANDARD; +use crate::encode::encode_config_buf; +use crate::tests::random_config; +use super::decoder::{BUF_SIZE, DecoderReader}; #[test] fn simple() { @@ -28,7 +27,7 @@ fn simple() { // Read n bytes at a time. for n in 1..base64data.len() + 1 { let mut wrapped_reader = Cursor::new(base64data); - let mut decoder = ::read::DecoderReader::new(&mut wrapped_reader, ::STANDARD); + let mut decoder = DecoderReader::new(&mut wrapped_reader, STANDARD); // handle errors as you normally would let mut text_got = Vec::new(); @@ -79,7 +78,7 @@ fn big() { BUF_SIZE + 1, ] { let mut wrapped_reader = Cursor::new(encoded.clone()); - let mut decoder = ::read::DecoderReader::new(&mut wrapped_reader, config); + let mut decoder = DecoderReader::new(&mut wrapped_reader, config); let mut data_got = Vec::new(); let mut buffer = vec![0u8; n]; @@ -110,7 +109,7 @@ fn trailing_junk() { // Read n bytes at a time. 
for n in 1..base64data.len() + 1 { let mut wrapped_reader = Cursor::new(base64data); - let mut decoder = ::read::DecoderReader::new(&mut wrapped_reader, ::STANDARD); + let mut decoder = DecoderReader::new(&mut wrapped_reader, STANDARD); // handle errors as you normally would let mut buffer = vec![0u8; n]; From fb3687515dba9ae47294f657891031cb75888e6d Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Tue, 19 Nov 2019 22:43:24 -0700 Subject: [PATCH 4/7] Test showing short read problem --- src/read/decoder.rs | 24 +++++++++----- src/read/decoder_tests.rs | 67 ++++++++++++++++++++++++++++++++++----- 2 files changed, 75 insertions(+), 16 deletions(-) diff --git a/src/read/decoder.rs b/src/read/decoder.rs index b3475ed6..1188381f 100644 --- a/src/read/decoder.rs +++ b/src/read/decoder.rs @@ -1,14 +1,14 @@ +use crate::{decode_config_slice, Config}; use std::cmp; use std::fmt; use std::io::{Error, ErrorKind, Read, Result}; -use crate::{decode_config_slice, Config}; // This should be large, but it has to fit on the stack. pub(crate) const BUF_SIZE: usize = 1024; // 4 bytes of base64 data encode 3 bytes of raw data (modulo padding). const BASE64_CHUNK_SIZE: usize = 4; -const RAW_CHUNK_SIZE: usize = 3; +const DECODED_CHUNK_SIZE: usize = 3; /// A `Read` implementation that decodes base64 data read from an underlying reader. /// @@ -95,19 +95,24 @@ impl<'a, R: Read> Read for DecoderReader<'a, R> { self.buffer_amount -= amount; Ok(amount) - } else if buf.len() >= 2 * RAW_CHUNK_SIZE { + } else if buf.len() >= 2 * DECODED_CHUNK_SIZE { // The caller wants at least two chunks. Round down to a // multiple of the chunk size and decode directly into the // caller-provided buffer. - - let mut base64_data = [0u8; BUF_SIZE]; - let base64_bytes = cmp::min(BUF_SIZE, (buf.len() / RAW_CHUNK_SIZE) * BASE64_CHUNK_SIZE); + let base64_bytes = cmp::min( + BUF_SIZE, + (buf.len() / DECODED_CHUNK_SIZE) * BASE64_CHUNK_SIZE, + ); assert!(base64_bytes > 0); - let read = self.r.read(&mut base64_data[..base64_bytes])?; + // TODO what if read provides less data than we asked for? + // borrow our buffer since it has nothing of value in it + let read = self.r.read(&mut self.buffer[..base64_bytes])?; - let decoded = decode_config_slice(&base64_data[..read], self.config, buf) + // TODO only decode complete chunks + let decoded = decode_config_slice(&self.buffer[..read], self.config, buf) .map_err(|e| Error::new(ErrorKind::InvalidData, e))?; + // TODO keep track of any data we didn't decode (i.e. incomplete chunks) Ok(decoded) } else { @@ -116,6 +121,9 @@ impl<'a, R: Read> Read for DecoderReader<'a, R> { // Double buffer a large amount in case short reads turn // out to be common. + // TODO maybe store base64 in the buffer, and keep a separate decode buffer that holds + // only a decoded chunk (3 bytes) to handle very short read calls? 
+ let mut base64_data = [0u8; BUF_SIZE]; let read = self.r.read(&mut base64_data)?; diff --git a/src/read/decoder_tests.rs b/src/read/decoder_tests.rs index 78eb3bb0..900358e4 100644 --- a/src/read/decoder_tests.rs +++ b/src/read/decoder_tests.rs @@ -1,12 +1,12 @@ -use std::io::Read; +use std::io::{self, Read}; -use rand::Rng; -use std::io::Cursor; +use rand::{Rng, RngCore}; +use std::iter; -use crate::STANDARD; +use super::decoder::{DecoderReader, BUF_SIZE}; use crate::encode::encode_config_buf; use crate::tests::random_config; -use super::decoder::{BUF_SIZE, DecoderReader}; +use crate::STANDARD; #[test] fn simple() { @@ -26,7 +26,7 @@ fn simple() { for (text_expected, base64data) in tests.iter() { // Read n bytes at a time. for n in 1..base64data.len() + 1 { - let mut wrapped_reader = Cursor::new(base64data); + let mut wrapped_reader = io::Cursor::new(base64data); let mut decoder = DecoderReader::new(&mut wrapped_reader, STANDARD); // handle errors as you normally would @@ -77,7 +77,7 @@ fn big() { rng.gen_range(1, BUF_SIZE), BUF_SIZE + 1, ] { - let mut wrapped_reader = Cursor::new(encoded.clone()); + let mut wrapped_reader = io::Cursor::new(encoded.clone()); let mut decoder = DecoderReader::new(&mut wrapped_reader, config); let mut data_got = Vec::new(); @@ -108,7 +108,7 @@ fn trailing_junk() { for base64data in tests.iter() { // Read n bytes at a time. for n in 1..base64data.len() + 1 { - let mut wrapped_reader = Cursor::new(base64data); + let mut wrapped_reader = io::Cursor::new(base64data); let mut decoder = DecoderReader::new(&mut wrapped_reader, STANDARD); // handle errors as you normally would @@ -126,3 +126,54 @@ fn trailing_junk() { } } } + +#[test] +fn handles_short_read() { + let mut rng = rand::thread_rng(); + let mut bytes = Vec::new(); + let mut b64 = String::new(); + let mut decoded = Vec::new(); + + for _ in 0..10_000 { + bytes.clear(); + b64.clear(); + decoded.clear(); + + let size = rng.gen_range(0, 10 * BUF_SIZE); + bytes.extend(iter::repeat(0).take(size)); + bytes.truncate(size); + rng.fill_bytes(&mut bytes[..size]); + assert_eq!(size, bytes.len()); + + let config = random_config(&mut rng); + encode_config_buf(&bytes[..], config, &mut b64); + + let mut wrapped_reader = io::Cursor::new(b64.as_bytes()); + let mut short_reader = RandomShortRead { + delegate: &mut wrapped_reader, + rng: &mut rng, + }; + + let mut decoder = DecoderReader::new(&mut short_reader, config); + + let decoded_len = decoder.read_to_end(&mut decoded).unwrap(); + assert_eq!(size, decoded_len); + assert_eq!(&bytes[..], &decoded[..]); + + } +} + +// exercise code that depends on +struct RandomShortRead<'a, 'b, R: io::Read, N: rand::Rng> { + delegate: &'b mut R, + rng: &'a mut N, +} + +impl<'a, 'b, R: io::Read, N: rand::Rng> io::Read for RandomShortRead<'a, 'b, R, N> { + fn read(&mut self, buf: &mut [u8]) -> Result { + let effective_len = self.rng.gen_range(0, 20); + println!("buf {} effective {}", buf.len(), effective_len); + + self.delegate.read(&mut buf[..effective_len]) + } +} From 12dcfb8d731d9d0c31f97f849f43ea97f7338b6d Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Sun, 24 Nov 2019 13:30:14 -0700 Subject: [PATCH 5/7] Re-implement DecoderReader. Turns out the initial, simple implementation didn't handle cases where slow or otherwise unpredictable delegate `Read`ers wouldn't provide as many bytes as we asked for, which meant there has to be a lot more bookkeeping. Also, only one nontrivial buffer is needed now. 
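
To make the failure mode concrete: a delegate reader is free to return fewer bytes than it was asked for, so a single read from it can end in the middle of a 4-byte base64 chunk. Here is an illustrative sketch (not part of this patch; the type name is made up, and the test suite uses a randomized variant, `RandomShortRead`) of the kind of reader that broke the old implementation:

```rust
use std::io::{self, Read};

/// A reader that returns at most `max` bytes per `read()` call, the way a
/// slow socket or pipe might. Illustrative only.
struct TrickleReader<R: Read> {
    inner: R,
    max: usize,
}

impl<R: Read> Read for TrickleReader<R> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        // Never hand back more than `max` bytes, even if more are available.
        let n = buf.len().min(self.max);
        self.inner.read(&mut buf[..n])
    }
}
```

With `max = 3`, the first read of `"MDEyMzQ1"` yields only `"MDE"`, which is not a complete 4-byte chunk, so the decoder has to hold those bytes and combine them with the next read before it can decode anything; that carry-over is the extra bookkeeping mentioned above.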
--- README.md | 2 +- RELEASE-NOTES.md | 4 +- benches/benchmarks.rs | 21 ++- src/lib.rs | 8 ++ src/read/decoder.rs | 280 ++++++++++++++++++++++++++++---------- src/read/decoder_tests.rs | 221 ++++++++++++++++++++++-------- 6 files changed, 401 insertions(+), 135 deletions(-) diff --git a/README.md b/README.md index 19f2feb2..7815966c 100644 --- a/README.md +++ b/README.md @@ -9,7 +9,7 @@ Made with CLion. Thanks to JetBrains for supporting open source! It's base64. What more could anyone want? -This library's goals are to be *correct* and *fast*. It's thoroughly tested and widely used. It exposes functionality at multiple levels of abstraction so you can choose the level of convenience vs performance that you want, e.g. `decode_config_slice` decodes into an existing `&mut [u8]` and is pretty fast (2.6GiB/s for a 3 KiB input), whereas `decode_config` allocates a new `Vec` and returns it, which might be more convenient in some cases, but is slower (although still fast enough for most purposes) at 2.1 GiB/s. +This library's goals are to be *correct* and *fast*. It's thoroughly tested and widely used. It exposes functionality at multiple levels of abstraction so you can choose the level of convenience vs performance that you want, e.g. `decode_config_slice` decodes into an existing `&mut [u8]` and is pretty fast (2.6GiB/s for a 3 KiB input), whereas `decode_config` allocates a new `Vec` and returns it, which might be more convenient in some cases, but is slower (although still fast enough for almost any purpose) at 2.1 GiB/s. Example --- diff --git a/RELEASE-NOTES.md b/RELEASE-NOTES.md index 1a730dca..0ef5c035 100644 --- a/RELEASE-NOTES.md +++ b/RELEASE-NOTES.md @@ -1,6 +1,8 @@ # Next -- 0.11.0 +- A `Read` implementation (`DecoderReader`) to let users transparently decoded data from a b64 input source + +# 0.11.0 - Minimum rust version 1.34.0 - `no_std` is now supported via the two new features `alloc` and `std`. diff --git a/benches/benchmarks.rs b/benches/benchmarks.rs index 475dfdb7..07f88721 100644 --- a/benches/benchmarks.rs +++ b/benches/benchmarks.rs @@ -11,7 +11,7 @@ use base64::{ use criterion::{black_box, Bencher, Criterion, ParameterizedBenchmark, Throughput}; use rand::{FromEntropy, Rng}; -use std::io::Write; +use std::io::{self, Read, Write}; const TEST_CONFIG: Config = base64::STANDARD; @@ -52,6 +52,24 @@ fn do_decode_bench_slice(b: &mut Bencher, &size: &usize) { }); } +fn do_decode_bench_stream(b: &mut Bencher, &size: &usize) { + let mut v: Vec = Vec::with_capacity(size * 3 / 4); + fill(&mut v); + let encoded = encode(&v); + + let mut buf = Vec::new(); + buf.resize(size, 0); + buf.truncate(0); + + b.iter(|| { + let mut cursor = io::Cursor::new(&encoded[..]); + let mut decoder = base64::read::DecoderReader::new(&mut cursor, TEST_CONFIG); + decoder.read_to_end(&mut buf).unwrap(); + buf.clear(); + black_box(&buf); + }); +} + fn do_encode_bench(b: &mut Bencher, &size: &usize) { let mut v: Vec = Vec::with_capacity(size); fill(&mut v); @@ -138,6 +156,7 @@ fn decode_benchmarks(byte_sizes: &[usize]) -> ParameterizedBenchmark { .throughput(|s| Throughput::Bytes(*s as u64)) .with_function("decode_reuse_buf", do_decode_bench_reuse_buf) .with_function("decode_slice", do_decode_bench_slice) + .with_function("decode_stream", do_decode_bench_stream) } fn bench(c: &mut Criterion) { diff --git a/src/lib.rs b/src/lib.rs index 34117b79..d0000dc5 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,6 +43,14 @@ //! Input can be invalid because it has invalid characters or invalid padding. 
(No padding at all is //! valid, but excess padding is not.) Whitespace in the input is invalid. //! +//! # `Read` and `Write` +//! +//! To map a `Read` of b64 bytes to the decoded bytes, wrap a reader (file, network socket, etc) +//! with `base64::read::DecoderReader`. To write raw bytes and have them b64 encoded on the fly, +//! wrap a writer with `base64::write::EncoderWriter`. There is some performance overhead (15% or +//! so) because of the necessary buffer shuffling -- still fast enough that almost nobody cares. +//! Also, these implementations do not heap allocate. +//! //! # Panics //! //! If length calculations result in overflowing `usize`, a panic will result. diff --git a/src/read/decoder.rs b/src/read/decoder.rs index 1188381f..9fa28cd7 100644 --- a/src/read/decoder.rs +++ b/src/read/decoder.rs @@ -1,7 +1,6 @@ -use crate::{decode_config_slice, Config}; -use std::cmp; -use std::fmt; -use std::io::{Error, ErrorKind, Read, Result}; +use crate::{decode_config_slice, Config, DecodeError}; +use std::io::Read; +use std::{cmp, fmt, io}; // This should be large, but it has to fit on the stack. pub(crate) const BUF_SIZE: usize = 1024; @@ -30,113 +29,244 @@ const DECODED_CHUNK_SIZE: usize = 3; /// assert_eq!(b"asdf", &result[..]); /// /// ``` -pub struct DecoderReader<'a, R: 'a + Read> { +pub struct DecoderReader<'a, R: 'a + io::Read> { config: Config, - /// Where encoded data is read from + /// Where b64 data is read from r: &'a mut R, - // The maximum of decoded base64 data that we'll buffer at once. - buffer: [u8; BUF_SIZE], - // The start of the unreturned buffered data. - buffer_offset: usize, - // The amount of buffered data. - buffer_amount: usize, + // Holds b64 data read from the delegate reader. + b64_buffer: [u8; BUF_SIZE], + // The start of the pending buffered data in b64_buffer. + b64_offset: usize, + // The amount of buffered b64 data. + b64_len: usize, + // Since the caller may provide us with a buffer of size 1 or 2 that's too small to copy a + // decoded chunk in to, we have to be able to hang on to a few decoded bytes. + // Technically we only need to hold 2 bytes but then we'd need a separate temporary buffer to + // decode 3 bytes into and then juggle copying one byte into the provided read buf and the rest + // into here, which seems like a lot of complexity for 1 extra byte of storage. + decoded_buffer: [u8; 3], + // index of start of decoded data + decoded_offset: usize, + // length of decoded data + decoded_len: usize, + // used to provide accurate offsets in errors + total_b64_decoded: usize, } -impl<'a, R: Read> fmt::Debug for DecoderReader<'a, R> { +impl<'a, R: io::Read> fmt::Debug for DecoderReader<'a, R> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("DecoderReader") .field("config", &self.config) - .field("buffer_offset", &self.buffer_offset) - .field("buffer_amount", &self.buffer_amount) + .field("b64_offset", &self.b64_offset) + .field("b64_len", &self.b64_len) + .field("decoded_buffer", &self.decoded_buffer) + .field("decoded_offset", &self.decoded_offset) + .field("decoded_len", &self.decoded_len) + .field("total_b64_decoded", &self.total_b64_decoded) .finish() } } -impl<'a, R: Read> DecoderReader<'a, R> { +impl<'a, R: io::Read> DecoderReader<'a, R> { /// Create a new decoder that will read from the provided reader `r`. 
pub fn new(r: &'a mut R, config: Config) -> Self { DecoderReader { config, r, - buffer: [0; BUF_SIZE], - buffer_offset: 0, - buffer_amount: 0, + b64_buffer: [0; BUF_SIZE], + b64_offset: 0, + b64_len: 0, + decoded_buffer: [0; DECODED_CHUNK_SIZE], + decoded_offset: 0, + decoded_len: 0, + total_b64_decoded: 0, } } + + /// Write as much as possible of the decoded buffer into the target buffer. + /// Must only be called when there is something to write and space to write into. + /// Returns a Result with the number of (decoded) bytes copied. + fn flush_decoded_buf(&mut self, buf: &mut [u8]) -> io::Result { + debug_assert!(self.decoded_len > 0); + debug_assert!(buf.len() > 0); + + let copy_len = cmp::min(self.decoded_len, buf.len()); + debug_assert!(copy_len > 0); + debug_assert!(copy_len <= self.decoded_len); + + buf[..copy_len].copy_from_slice( + &self.decoded_buffer[self.decoded_offset..self.decoded_offset + copy_len], + ); + + self.decoded_offset += copy_len; + self.decoded_len -= copy_len; + + debug_assert!(self.decoded_len < DECODED_CHUNK_SIZE); + + Ok(copy_len) + } + + /// Read into the remaining space in the buffer after the current contents. + /// Must only be called when there is space to read into in the buffer. + /// Returns the number of bytes read. + fn read_from_delegate(&mut self) -> io::Result { + debug_assert!(self.b64_offset + self.b64_len < BUF_SIZE); + + let read = self + .r + .read(&mut self.b64_buffer[self.b64_offset + self.b64_len..])?; + self.b64_len += read; + + debug_assert!(self.b64_offset + self.b64_len <= BUF_SIZE); + + return Ok(read); + } + + /// Decode the requested number of bytes from the b64 buffer into the provided buffer. It's the + /// caller's responsibility to choose the number of b64 bytes to decode correctly. + /// + /// Returns a Result with the number of decoded bytes written. + fn decode_to_buf(&mut self, num_bytes: usize, buf: &mut [u8]) -> io::Result { + debug_assert!(self.b64_len >= num_bytes); + debug_assert!(self.b64_offset + self.b64_len <= BUF_SIZE); + debug_assert!(buf.len() > 0); + + let decoded = decode_config_slice( + &self.b64_buffer[self.b64_offset..self.b64_offset + num_bytes], + self.config, + &mut buf[..], + ) + .map_err(|e| match e { + DecodeError::InvalidByte(offset, byte) => { + DecodeError::InvalidByte(self.total_b64_decoded + offset, byte) + } + DecodeError::InvalidLength => DecodeError::InvalidLength, + DecodeError::InvalidLastSymbol(offset, byte) => { + DecodeError::InvalidLastSymbol(self.total_b64_decoded + offset, byte) + } + }) + .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; + + self.total_b64_decoded += num_bytes; + self.b64_len -= num_bytes; + self.b64_offset += num_bytes; + + debug_assert!(self.b64_offset + self.b64_len <= BUF_SIZE); + + Ok(decoded) + } } impl<'a, R: Read> Read for DecoderReader<'a, R> { /// Decode input from the wrapped reader. /// /// Under non-error circumstances, this returns `Ok` with the value being the number of bytes - /// returned in `buf`. + /// written in `buf`. /// - /// If the caller performs a short read, then this function reads in a large chunk of data, - /// decodes that, and buffers the result. The intent is to amortize the decoding cost - /// of many small reads. + /// Where possible, this function buffers base64 to minimize the number of read() calls to the + /// delegate reader. /// /// # Errors /// - /// Any errors emitted by the delegate reader are returned. - fn read(&mut self, buf: &mut [u8]) -> Result { - // Check some invariants. 
- assert!(self.buffer_offset < BUF_SIZE); - assert!(self.buffer_amount <= BUF_SIZE); - assert!(self.buffer_offset + self.buffer_amount <= BUF_SIZE); - - if self.buffer_amount > 0 { - // We have something buffered, use that. - - let amount = cmp::min(buf.len(), self.buffer_amount); - buf[..amount] - .copy_from_slice(&self.buffer[self.buffer_offset..self.buffer_offset + amount]); - self.buffer_offset += amount; - self.buffer_amount -= amount; - - Ok(amount) - } else if buf.len() >= 2 * DECODED_CHUNK_SIZE { - // The caller wants at least two chunks. Round down to a - // multiple of the chunk size and decode directly into the - // caller-provided buffer. - let base64_bytes = cmp::min( - BUF_SIZE, - (buf.len() / DECODED_CHUNK_SIZE) * BASE64_CHUNK_SIZE, - ); - assert!(base64_bytes > 0); - - // TODO what if read provides less data than we asked for? - // borrow our buffer since it has nothing of value in it - let read = self.r.read(&mut self.buffer[..base64_bytes])?; - - // TODO only decode complete chunks - let decoded = decode_config_slice(&self.buffer[..read], self.config, buf) - .map_err(|e| Error::new(ErrorKind::InvalidData, e))?; - // TODO keep track of any data we didn't decode (i.e. incomplete chunks) - - Ok(decoded) + /// Any errors emitted by the delegate reader are returned. Decoding errors due to invalid + /// base64 are also possible, and will have `io::ErrorKind::InvalidData`. + fn read(&mut self, buf: &mut [u8]) -> io::Result { + if buf.len() == 0 { + return Ok(0); + } + + // offset == BUF_SIZE when we copied it all last time + debug_assert!(self.b64_offset <= BUF_SIZE); + debug_assert!(self.b64_offset + self.b64_len <= BUF_SIZE); + debug_assert!(if self.b64_offset == BUF_SIZE { + self.b64_len == 0 + } else { + self.b64_len <= BUF_SIZE + }); + + debug_assert!(if self.decoded_len == 0 { + // can be = when we were able to copy the complete chunk + self.decoded_offset <= DECODED_CHUNK_SIZE } else { - // The caller wants less than a chunk of decoded data - // (i.e., one or two bytes). We have to buffer something. - // Double buffer a large amount in case short reads turn - // out to be common. + self.decoded_offset < DECODED_CHUNK_SIZE + }); + + // We shouldn't ever decode into here when we can't immediately write at least one byte into + // the provided buf, so the effective length should only be 3 momentarily between when we + // decode and when we copy into the target buffer. 
+ debug_assert!(self.decoded_len < DECODED_CHUNK_SIZE); + debug_assert!(self.decoded_len + self.decoded_offset <= DECODED_CHUNK_SIZE); + + if self.decoded_len > 0 { + // we have a few leftover decoded bytes; flush that rather than pull in more b64 + self.flush_decoded_buf(buf) + } else { + let mut at_eof = false; + while self.b64_len < BASE64_CHUNK_SIZE { + // scoot the few bytes we have to the beginning + self.b64_buffer + .copy_within(self.b64_offset..self.b64_offset + self.b64_len, 0); + self.b64_offset = 0; + + // then fill in more data + let read = self.read_from_delegate()?; + if read == 0 { + // we never pass in an empty buf, so 0 => we've hit EOF + at_eof = true; + break; + } + } + + if self.b64_len == 0 { + // we must be at EOF, and we have no data left to decode + return Ok(0); + }; + + debug_assert!(if at_eof { + // if we are at eof, we may not have a complete chunk + self.b64_len > 0 + } else { + // otherwise, we must have at least one chunk + self.b64_len >= BASE64_CHUNK_SIZE + }); + + if buf.len() < DECODED_CHUNK_SIZE { + // caller requested an annoyingly short read + debug_assert_eq!(0, self.decoded_len); + + // have to write to a tmp buf first to avoid double mutable borrow + let mut decoded_chunk = [0_u8; DECODED_CHUNK_SIZE]; + // if we are at eof, could have less than BASE64_CHUNK_SIZE + let to_decode = cmp::min(self.b64_len, BASE64_CHUNK_SIZE); - // TODO maybe store base64 in the buffer, and keep a separate decode buffer that holds - // only a decoded chunk (3 bytes) to handle very short read calls? + let decoded = self.decode_to_buf(to_decode, &mut decoded_chunk[..])?; + self.decoded_buffer[..decoded].copy_from_slice(&decoded_chunk[..decoded]); - let mut base64_data = [0u8; BUF_SIZE]; - let read = self.r.read(&mut base64_data)?; + self.decoded_offset = 0; + self.decoded_len = decoded; - let decoded = decode_config_slice(&base64_data[..read], self.config, &mut self.buffer) - .map_err(|e| Error::new(ErrorKind::InvalidData, e))?; + // can be less than 3 on last block due to padding + debug_assert!(decoded <= 3); - let returning = cmp::min(buf.len(), decoded); - buf[..returning].copy_from_slice(&self.buffer[..returning]); + self.flush_decoded_buf(buf) + } else { + let bytes_that_can_decode_into_buf = (buf.len() / DECODED_CHUNK_SIZE) + .checked_mul(BASE64_CHUNK_SIZE) + .expect("too many chunks"); + debug_assert!(bytes_that_can_decode_into_buf >= BASE64_CHUNK_SIZE); - self.buffer_offset = returning; - self.buffer_amount = decoded - returning; + let bytes_available_to_decode = if at_eof { + self.b64_len + } else { + // only use complete chunks + self.b64_len - self.b64_len % 4 + }; - Ok(returning) + let actual_decode_len = + cmp::min(bytes_that_can_decode_into_buf, bytes_available_to_decode); + self.decode_to_buf(actual_decode_len, buf) + } } } } diff --git a/src/read/decoder_tests.rs b/src/read/decoder_tests.rs index 900358e4..95f46a3d 100644 --- a/src/read/decoder_tests.rs +++ b/src/read/decoder_tests.rs @@ -1,12 +1,12 @@ use std::io::{self, Read}; use rand::{Rng, RngCore}; -use std::iter; +use std::{cmp, iter}; use super::decoder::{DecoderReader, BUF_SIZE}; use crate::encode::encode_config_buf; use crate::tests::random_config; -use crate::STANDARD; +use crate::{decode_config_buf, DecodeError, STANDARD}; #[test] fn simple() { @@ -50,56 +50,6 @@ fn simple() { } } -#[test] -fn big() { - let mut rng = rand::thread_rng(); - - for _ in 0..100 { - // The size. Not even multiples of BUF_SIZE. - let size = rng.gen_range(0, 10 * BUF_SIZE); - - // Create the random content. 
- let mut data_orig = Vec::with_capacity(size); - for _ in 0..size { - data_orig.push(rng.gen()); - } - - let config = random_config(&mut rng); - - // Encode it. - let mut encoded = String::new(); - encode_config_buf(&data_orig, config, &mut encoded); - - // Amount to read at a time: small, medium, large and XL. - for &n in &[ - rng.gen_range(1, 10), - rng.gen_range(1, 100), - rng.gen_range(1, BUF_SIZE), - BUF_SIZE + 1, - ] { - let mut wrapped_reader = io::Cursor::new(encoded.clone()); - let mut decoder = DecoderReader::new(&mut wrapped_reader, config); - - let mut data_got = Vec::new(); - let mut buffer = vec![0u8; n]; - while let Ok(read) = decoder.read(&mut buffer[..]) { - if read == 0 { - break; - } - data_got.extend_from_slice(&buffer[..read]); - } - - if data_got != data_orig { - panic!( - "\nGot: {}\nExpected: {}", - String::from_utf8_lossy(&data_got[..]), - String::from_utf8_lossy(&data_orig[..]) - ); - } - } - } -} - // Make sure we error out on trailing junk. #[test] fn trailing_junk() { @@ -116,7 +66,10 @@ fn trailing_junk() { let mut saw_error = false; loop { match decoder.read(&mut buffer[..]) { - Err(_) => saw_error = true, + Err(_) => { + saw_error = true; + break; + } Ok(read) if read == 0 => break, Ok(_) => (), } @@ -128,7 +81,7 @@ fn trailing_junk() { } #[test] -fn handles_short_read() { +fn handles_short_read_from_delegate() { let mut rng = rand::thread_rng(); let mut bytes = Vec::new(); let mut b64 = String::new(); @@ -159,11 +112,165 @@ fn handles_short_read() { let decoded_len = decoder.read_to_end(&mut decoded).unwrap(); assert_eq!(size, decoded_len); assert_eq!(&bytes[..], &decoded[..]); + } +} + +#[test] +fn read_in_short_increments() { + let mut rng = rand::thread_rng(); + let mut bytes = Vec::new(); + let mut b64 = String::new(); + let mut decoded = Vec::new(); + + for _ in 0..10_000 { + bytes.clear(); + b64.clear(); + decoded.clear(); + + let size = rng.gen_range(0, 10 * BUF_SIZE); + bytes.extend(iter::repeat(0).take(size)); + // leave room to play around with larger buffers + decoded.extend(iter::repeat(0).take(size * 3)); + + rng.fill_bytes(&mut bytes[..]); + assert_eq!(size, bytes.len()); + + let config = random_config(&mut rng); + + encode_config_buf(&bytes[..], config, &mut b64); + + let mut wrapped_reader = io::Cursor::new(&b64[..]); + let mut decoder = DecoderReader::new(&mut wrapped_reader, config); + + let mut total_read = 0_usize; + loop { + assert!(total_read <= size, "tr {} size {}", total_read, size); + if total_read == size { + assert_eq!(bytes, &decoded[..total_read]); + // should be done + assert_eq!(0, decoder.read(&mut decoded[..]).unwrap()); + // didn't write anything + assert_eq!(bytes, &decoded[..total_read]); + + break; + } + let decode_len = rng.gen_range(1, cmp::max(2, size * 2)); + + let read = decoder + .read(&mut decoded[total_read..total_read + decode_len]) + .unwrap(); + total_read += read; + } + } +} + +#[test] +fn reports_invalid_last_symbol_correctly() { + let mut rng = rand::thread_rng(); + let mut bytes = Vec::new(); + let mut b64 = String::new(); + let mut b64_bytes = Vec::new(); + let mut decoded = Vec::new(); + let mut bulk_decoded = Vec::new(); + + for _ in 0..1_000 { + bytes.clear(); + b64.clear(); + b64_bytes.clear(); + + let size = rng.gen_range(1, 10 * BUF_SIZE); + bytes.extend(iter::repeat(0).take(size)); + decoded.extend(iter::repeat(0).take(size)); + rng.fill_bytes(&mut bytes[..]); + assert_eq!(size, bytes.len()); + + let mut config = random_config(&mut rng); + // changing padding will cause invalid padding errors 
when we twiddle the last byte + config.pad = false; + + encode_config_buf(&bytes[..], config, &mut b64); + b64_bytes.extend(b64.bytes()); + assert_eq!(b64_bytes.len(), b64.len()); + + // change the last character to every possible symbol. Should behave the same as bulk + // decoding whether invalid or valid. + for &s1 in config.char_set.encode_table().iter() { + decoded.clear(); + bulk_decoded.clear(); + + // replace the last + *b64_bytes.last_mut().unwrap() = s1; + let bulk_res = decode_config_buf(&b64_bytes[..], config, &mut bulk_decoded); + + let mut wrapped_reader = io::Cursor::new(&b64_bytes[..]); + let mut decoder = DecoderReader::new(&mut wrapped_reader, config); + + let stream_res = decoder.read_to_end(&mut decoded).map(|_| ()).map_err(|e| { + e.into_inner() + .and_then(|e| e.downcast::().ok()) + }); + + assert_eq!(bulk_res.map_err(|e| Some(Box::new(e))), stream_res); + } + } +} + +#[test] +fn reports_invalid_byte_correctly() { + let mut rng = rand::thread_rng(); + let mut bytes = Vec::new(); + let mut b64 = String::new(); + let mut decoded = Vec::new(); + + for _ in 0..10_000 { + bytes.clear(); + b64.clear(); + decoded.clear(); + + let size = rng.gen_range(1, 10 * BUF_SIZE); + bytes.extend(iter::repeat(0).take(size)); + rng.fill_bytes(&mut bytes[..size]); + assert_eq!(size, bytes.len()); + + let config = random_config(&mut rng); + encode_config_buf(&bytes[..], config, &mut b64); + // replace one byte, somewhere, with '*', which is invalid + let bad_byte_pos = rng.gen_range(0, &b64.len()); + let mut b64_bytes = b64.bytes().collect::>(); + b64_bytes[bad_byte_pos] = b'*'; + + let mut wrapped_reader = io::Cursor::new(b64_bytes.clone()); + let mut decoder = DecoderReader::new(&mut wrapped_reader, config); + + // some gymnastics to avoid double-moving the io::Error, which is not Copy + let read_decode_err = decoder + .read_to_end(&mut decoded) + .map_err(|e| { + let kind = e.kind(); + let inner = e + .into_inner() + .and_then(|e| e.downcast::().ok()); + inner.map(|i| (*i, kind)) + }) + .err() + .and_then(|o| o); + + let mut bulk_buf = Vec::new(); + let bulk_decode_err = decode_config_buf(&b64_bytes[..], config, &mut bulk_buf).err(); + // it's tricky to predict where the invalid data's offset will be since if it's in the last + // chunk it will be reported at the first padding location because it's treated as invalid + // padding. So, we just check that it's the same as it is for decoding all at once. + assert_eq!( + bulk_decode_err.map(|e| (e, io::ErrorKind::InvalidData)), + read_decode_err + ); } } -// exercise code that depends on +/// Limits how many bytes a reader will provide in each read call. +/// Useful for shaking out code that may work fine only with typical input sources that always fill +/// the buffer. 
struct RandomShortRead<'a, 'b, R: io::Read, N: rand::Rng> { delegate: &'b mut R, rng: &'a mut N, @@ -171,8 +278,8 @@ struct RandomShortRead<'a, 'b, R: io::Read, N: rand::Rng> { impl<'a, 'b, R: io::Read, N: rand::Rng> io::Read for RandomShortRead<'a, 'b, R, N> { fn read(&mut self, buf: &mut [u8]) -> Result { - let effective_len = self.rng.gen_range(0, 20); - println!("buf {} effective {}", buf.len(), effective_len); + // avoid 0 since it means EOF for non-empty buffers + let effective_len = self.rng.gen_range(1, 20); self.delegate.read(&mut buf[..effective_len]) } From 0585b0324fc30111ad2ff2b43476f1413b5fbf28 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Sun, 24 Nov 2019 14:30:30 -0700 Subject: [PATCH 6/7] Fix 1.34.0 compatibility --- src/read/decoder.rs | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/read/decoder.rs b/src/read/decoder.rs index 9fa28cd7..8485ffaf 100644 --- a/src/read/decoder.rs +++ b/src/read/decoder.rs @@ -204,9 +204,14 @@ impl<'a, R: Read> Read for DecoderReader<'a, R> { } else { let mut at_eof = false; while self.b64_len < BASE64_CHUNK_SIZE { - // scoot the few bytes we have to the beginning - self.b64_buffer - .copy_within(self.b64_offset..self.b64_offset + self.b64_len, 0); + // Work around lack of copy_within, which is only present in 1.37 + // Copy any bytes we have to the start of the buffer. + // We know we have < 1 chunk, so we can use a tiny tmp buffer. + let mut memmove_buf = [0_u8; BASE64_CHUNK_SIZE]; + memmove_buf[..self.b64_len].copy_from_slice( + &self.b64_buffer[self.b64_offset..self.b64_offset + self.b64_len], + ); + self.b64_buffer[0..self.b64_len].copy_from_slice(&memmove_buf[..self.b64_len]); self.b64_offset = 0; // then fill in more data From 0a020eec1ccd64a79d41b76779fe0f03b8e84a40 Mon Sep 17 00:00:00 2001 From: Marshall Pierce Date: Sun, 8 Mar 2020 15:04:45 -0600 Subject: [PATCH 7/7] Add another test --- src/read/decoder.rs | 25 ++++++----- src/read/decoder_tests.rs | 87 ++++++++++++++++++++++++++++++--------- 2 files changed, 83 insertions(+), 29 deletions(-) diff --git a/src/read/decoder.rs b/src/read/decoder.rs index 8485ffaf..7a9c4cd2 100644 --- a/src/read/decoder.rs +++ b/src/read/decoder.rs @@ -126,7 +126,7 @@ impl<'a, R: io::Read> DecoderReader<'a, R> { /// Decode the requested number of bytes from the b64 buffer into the provided buffer. It's the /// caller's responsibility to choose the number of b64 bytes to decode correctly. /// - /// Returns a Result with the number of decoded bytes written. + /// Returns a Result with the number of decoded bytes written to `buf`. 
fn decode_to_buf(&mut self, num_bytes: usize, buf: &mut [u8]) -> io::Result { debug_assert!(self.b64_len >= num_bytes); debug_assert!(self.b64_offset + self.b64_len <= BUF_SIZE); @@ -149,8 +149,8 @@ impl<'a, R: io::Read> DecoderReader<'a, R> { .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?; self.total_b64_decoded += num_bytes; - self.b64_len -= num_bytes; self.b64_offset += num_bytes; + self.b64_len -= num_bytes; debug_assert!(self.b64_offset + self.b64_len <= BUF_SIZE); @@ -224,6 +224,7 @@ impl<'a, R: Read> Read for DecoderReader<'a, R> { } if self.b64_len == 0 { + debug_assert!(at_eof); // we must be at EOF, and we have no data left to decode return Ok(0); }; @@ -236,13 +237,15 @@ impl<'a, R: Read> Read for DecoderReader<'a, R> { self.b64_len >= BASE64_CHUNK_SIZE }); + debug_assert_eq!(0, self.decoded_len); + if buf.len() < DECODED_CHUNK_SIZE { // caller requested an annoyingly short read - debug_assert_eq!(0, self.decoded_len); - // have to write to a tmp buf first to avoid double mutable borrow let mut decoded_chunk = [0_u8; DECODED_CHUNK_SIZE]; - // if we are at eof, could have less than BASE64_CHUNK_SIZE + // if we are at eof, could have less than BASE64_CHUNK_SIZE, in which case we have + // to assume that these last few tokens are, in fact, valid (i.e. must be 2-4 b64 + // tokens, not 1, since 1 token can't decode to 1 byte). let to_decode = cmp::min(self.b64_len, BASE64_CHUNK_SIZE); let decoded = self.decode_to_buf(to_decode, &mut decoded_chunk[..])?; @@ -256,20 +259,22 @@ impl<'a, R: Read> Read for DecoderReader<'a, R> { self.flush_decoded_buf(buf) } else { - let bytes_that_can_decode_into_buf = (buf.len() / DECODED_CHUNK_SIZE) + let b64_bytes_that_can_decode_into_buf = (buf.len() / DECODED_CHUNK_SIZE) .checked_mul(BASE64_CHUNK_SIZE) .expect("too many chunks"); - debug_assert!(bytes_that_can_decode_into_buf >= BASE64_CHUNK_SIZE); + debug_assert!(b64_bytes_that_can_decode_into_buf >= BASE64_CHUNK_SIZE); - let bytes_available_to_decode = if at_eof { + let b64_bytes_available_to_decode = if at_eof { self.b64_len } else { // only use complete chunks self.b64_len - self.b64_len % 4 }; - let actual_decode_len = - cmp::min(bytes_that_can_decode_into_buf, bytes_available_to_decode); + let actual_decode_len = cmp::min( + b64_bytes_that_can_decode_into_buf, + b64_bytes_available_to_decode, + ); self.decode_to_buf(actual_decode_len, buf) } } diff --git a/src/read/decoder_tests.rs b/src/read/decoder_tests.rs index 95f46a3d..265d423a 100644 --- a/src/read/decoder_tests.rs +++ b/src/read/decoder_tests.rs @@ -142,25 +142,42 @@ fn read_in_short_increments() { let mut wrapped_reader = io::Cursor::new(&b64[..]); let mut decoder = DecoderReader::new(&mut wrapped_reader, config); - let mut total_read = 0_usize; - loop { - assert!(total_read <= size, "tr {} size {}", total_read, size); - if total_read == size { - assert_eq!(bytes, &decoded[..total_read]); - // should be done - assert_eq!(0, decoder.read(&mut decoded[..]).unwrap()); - // didn't write anything - assert_eq!(bytes, &decoded[..total_read]); - - break; - } - let decode_len = rng.gen_range(1, cmp::max(2, size * 2)); + consume_with_short_reads_and_validate(&mut rng, &bytes[..], &mut decoded, &mut decoder); + } +} - let read = decoder - .read(&mut decoded[total_read..total_read + decode_len]) - .unwrap(); - total_read += read; - } +#[test] +fn read_in_short_increments_with_short_delegate_reads() { + let mut rng = rand::thread_rng(); + let mut bytes = Vec::new(); + let mut b64 = String::new(); + let mut decoded = Vec::new(); + + 
for _ in 0..10_000 { + bytes.clear(); + b64.clear(); + decoded.clear(); + + let size = rng.gen_range(0, 10 * BUF_SIZE); + bytes.extend(iter::repeat(0).take(size)); + // leave room to play around with larger buffers + decoded.extend(iter::repeat(0).take(size * 3)); + + rng.fill_bytes(&mut bytes[..]); + assert_eq!(size, bytes.len()); + + let config = random_config(&mut rng); + + encode_config_buf(&bytes[..], config, &mut b64); + + let mut base_reader = io::Cursor::new(&b64[..]); + let mut decoder = DecoderReader::new(&mut base_reader, config); + let mut short_reader = RandomShortRead { + delegate: &mut decoder, + rng: &mut rand::thread_rng(), + }; + + consume_with_short_reads_and_validate(&mut rng, &bytes[..], &mut decoded, &mut short_reader) } } @@ -268,6 +285,38 @@ fn reports_invalid_byte_correctly() { } } +fn consume_with_short_reads_and_validate( + rng: &mut rand::rngs::ThreadRng, + expected_bytes: &[u8], + decoded: &mut Vec, + short_reader: &mut R, +) -> () { + let mut total_read = 0_usize; + loop { + assert!( + total_read <= expected_bytes.len(), + "tr {} size {}", + total_read, + expected_bytes.len() + ); + if total_read == expected_bytes.len() { + assert_eq!(expected_bytes, &decoded[..total_read]); + // should be done + assert_eq!(0, short_reader.read(&mut decoded[..]).unwrap()); + // didn't write anything + assert_eq!(expected_bytes, &decoded[..total_read]); + + break; + } + let decode_len = rng.gen_range(1, cmp::max(2, expected_bytes.len() * 2)); + + let read = short_reader + .read(&mut decoded[total_read..total_read + decode_len]) + .unwrap(); + total_read += read; + } +} + /// Limits how many bytes a reader will provide in each read call. /// Useful for shaking out code that may work fine only with typical input sources that always fill /// the buffer. @@ -279,7 +328,7 @@ struct RandomShortRead<'a, 'b, R: io::Read, N: rand::Rng> { impl<'a, 'b, R: io::Read, N: rand::Rng> io::Read for RandomShortRead<'a, 'b, R, N> { fn read(&mut self, buf: &mut [u8]) -> Result { // avoid 0 since it means EOF for non-empty buffers - let effective_len = self.rng.gen_range(1, 20); + let effective_len = cmp::min(self.rng.gen_range(1, 20), buf.len()); self.delegate.read(&mut buf[..effective_len]) }
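
For reviewers skimming the series: the end state is that `DecoderReader` wraps any `io::Read` and yields decoded bytes, reporting invalid base64 as an `io::Error` with `ErrorKind::InvalidData`. A minimal usage sketch (not part of the patches; the wrapper function and its error handling are illustrative):

```rust
use std::io::{self, Read};

// Decode base64 from any reader; decoding happens incrementally as bytes are
// pulled from `src`, rather than by buffering the entire encoded input first.
fn decode_stream<R: Read>(src: &mut R) -> io::Result<Vec<u8>> {
    let mut decoder = base64::read::DecoderReader::new(src, base64::STANDARD);
    let mut decoded = Vec::new();
    decoder.read_to_end(&mut decoded)?;
    Ok(decoded)
}
```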