diff --git a/src/uu/base32/src/base_common.rs b/src/uu/base32/src/base_common.rs index 96d28e18926..65cadc7c3d0 100644 --- a/src/uu/base32/src/base_common.rs +++ b/src/uu/base32/src/base_common.rs @@ -8,11 +8,11 @@ use clap::{Arg, ArgAction, Command}; use std::ffi::OsString; use std::fs::File; -use std::io::{self, ErrorKind, Read, Seek}; +use std::io::{self, ErrorKind, Read, Seek, Write}; use std::path::{Path, PathBuf}; use uucore::display::Quotable; use uucore::encoding::{ - BASE2LSBF, BASE2MSBF, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format, + BASE2LSBF, BASE2MSBF, Base32Wrapper, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format, SupportsFastDecodeAndEncode, Z85Wrapper, for_base_common::{BASE32, BASE32HEX, BASE64URL, HEXUPPER_PERMISSIVE}, }; @@ -193,7 +193,7 @@ pub fn handle_input(input: &mut R, format: Format, config: Confi let supports_fast_decode_and_encode_ref = supports_fast_decode_and_encode.as_ref(); let mut stdout_lock = io::stdout().lock(); - if config.decode { + let result = if config.decode { fast_decode::fast_decode( read, &mut stdout_lock, @@ -207,6 +207,14 @@ pub fn handle_input(input: &mut R, format: Format, config: Confi supports_fast_decode_and_encode_ref, config.wrap_cols, ) + }; + + // Ensure any pending stdout buffer is flushed even if decoding failed; GNU basenc + // keeps already-decoded bytes visible before reporting the error. + match (result, stdout_lock.flush()) { + (res, Ok(())) => res, + (Ok(_), Err(err)) => Err(err.into()), + (Err(original), Err(_)) => Err(original), } } @@ -247,14 +255,14 @@ pub fn get_supports_fast_decode_and_encode( // spell-checker:disable-next-line b"01", )), - Format::Base32 => Box::from(EncodingWrapper::new( + Format::Base32 => Box::from(Base32Wrapper::new( BASE32, BASE32_VALID_DECODING_MULTIPLE, BASE32_UNPADDED_MULTIPLE, // spell-checker:disable-next-line b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=", )), - Format::Base32Hex => Box::from(EncodingWrapper::new( + Format::Base32Hex => Box::from(Base32Wrapper::new( BASE32HEX, BASE32_VALID_DECODING_MULTIPLE, BASE32_UNPADDED_MULTIPLE, @@ -502,43 +510,21 @@ pub mod fast_encode { pub mod fast_decode { use std::io::{self, Write}; - use uucore::{encoding::SupportsFastDecodeAndEncode, error::UResult}; + use uucore::{ + encoding::SupportsFastDecodeAndEncode, + error::{UResult, USimpleError}, + }; // Start of helper functions - fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256] { - // If `ignore_garbage` is enabled, all characters outside the alphabet are ignored - // If it is not enabled, only '\n' and '\r' are ignored - if ignore_garbage { - // Note: "false" here - let mut table = [false; 256]; - - // Pass through no characters except those in the alphabet - for ue in alphabet { - let us = usize::from(*ue); - - // Should not have been set yet - assert!(!table[us]); - - table[us] = true; - } + fn alphabet_lookup(alphabet: &[u8]) -> [bool; 256] { + // Precompute O(1) membership checks so we can validate every byte before decoding. + let mut table = [false; 256]; - table - } else { - // Note: "true" here - let mut table = [true; 256]; - - // Pass through all characters except '\n' and '\r' - for ue in [b'\n', b'\r'] { - let us = usize::from(ue); - - // Should not have been set yet - assert!(table[us]); - - table[us] = false; - } - - table + for &byte in alphabet { + table[usize::from(byte)] = true; } + + table } fn decode_in_chunks_to_buffer( @@ -553,11 +539,44 @@ pub mod fast_decode { fn write_to_output(decoded_buffer: &mut Vec, output: &mut dyn Write) -> io::Result<()> { // Write all data in `decoded_buffer` to `output` output.write_all(decoded_buffer.as_slice())?; + output.flush()?; decoded_buffer.clear(); Ok(()) } + + fn flush_ready_chunks( + buffer: &mut Vec, + block_limit: usize, + valid_multiple: usize, + supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode, + decoded_buffer: &mut Vec, + output: &mut dyn Write, + ) -> UResult<()> { + // While at least one full decode block is buffered, keep draining + // it and never yield more than block_limit per chunk. + while buffer.len() >= valid_multiple { + let take = buffer.len().min(block_limit); + let aligned_take = take - (take % valid_multiple); + + if aligned_take < valid_multiple { + break; + } + + decode_in_chunks_to_buffer( + supports_fast_decode_and_encode, + &buffer[..aligned_take], + decoded_buffer, + )?; + + write_to_output(decoded_buffer, output)?; + + buffer.drain(..aligned_take); + } + + Ok(()) + } // End of helper functions pub fn fast_decode( @@ -569,22 +588,12 @@ pub mod fast_decode { const DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024; let alphabet = supports_fast_decode_and_encode.alphabet(); - let decode_in_chunks_of_size = supports_fast_decode_and_encode.valid_decoding_multiple() - * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE; + let alphabet_table = alphabet_lookup(alphabet); + let valid_multiple = supports_fast_decode_and_encode.valid_decoding_multiple(); + let decode_in_chunks_of_size = valid_multiple * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE; assert!(decode_in_chunks_of_size > 0); - - // Note that it's not worth using "data-encoding"'s ignore functionality if `ignore_garbage` is true, because - // "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before - // passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also - // allows execution to stay on the happy path in "data-encoding": - // https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756 - // It is also not worth using "data-encoding"'s ignore functionality when `ignore_garbage` is - // false. - // Note that the alphabet constants above already include the padding characters - // TODO - // Precompute this - let table = alphabet_to_table(alphabet, ignore_garbage); + assert!(valid_multiple > 0); // Start of buffers @@ -595,35 +604,69 @@ pub mod fast_decode { let mut buffer = Vec::with_capacity(decode_in_chunks_of_size); - input - .iter() - .filter(|ch| table[usize::from(**ch)]) - .for_each(|ch| { - buffer.push(*ch); - // How many bytes to steal from `read_buffer` to get - // `leftover_buffer` to the right size - if buffer.len() == decode_in_chunks_of_size { - assert_eq!(decode_in_chunks_of_size, buffer.len()); - // Decode data in chunks, then place it in `decoded_buffer` - decode_in_chunks_to_buffer( - supports_fast_decode_and_encode, - &buffer, - &mut decoded_buffer, - ) - .unwrap(); - // Write all data in `decoded_buffer` to `output` - write_to_output(&mut decoded_buffer, output).unwrap(); - buffer.clear(); - } - }); - // Cleanup - // `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed - { - // Decode all remaining encoded bytes, placing them in `decoded_buffer` - supports_fast_decode_and_encode.decode_into_vec(&buffer, &mut decoded_buffer)?; + let supports_partial_decode = supports_fast_decode_and_encode.supports_partial_decode(); - // Write all data in `decoded_buffer` to `output` + for &byte in &input { + if byte == b'\n' || byte == b'\r' { + continue; + } + + if alphabet_table[usize::from(byte)] { + buffer.push(byte); + } else if ignore_garbage { + continue; + } else { + return Err(USimpleError::new(1, "error: invalid input".to_owned())); + } + + if supports_partial_decode { + flush_ready_chunks( + &mut buffer, + decode_in_chunks_of_size, + valid_multiple, + supports_fast_decode_and_encode, + &mut decoded_buffer, + output, + )?; + } else if buffer.len() == decode_in_chunks_of_size { + decode_in_chunks_to_buffer( + supports_fast_decode_and_encode, + &buffer, + &mut decoded_buffer, + )?; + write_to_output(&mut decoded_buffer, output)?; + buffer.clear(); + } + } + + if supports_partial_decode { + flush_ready_chunks( + &mut buffer, + decode_in_chunks_of_size, + valid_multiple, + supports_fast_decode_and_encode, + &mut decoded_buffer, + output, + )?; + } + + if !buffer.is_empty() { + let mut owned_chunk: Option> = None; + let mut had_invalid_tail = false; + + if let Some(pad_result) = supports_fast_decode_and_encode.pad_remainder(&buffer) { + had_invalid_tail = pad_result.had_invalid_tail; + owned_chunk = Some(pad_result.chunk); + } + + let final_chunk = owned_chunk.as_deref().unwrap_or(&buffer); + + supports_fast_decode_and_encode.decode_into_vec(final_chunk, &mut decoded_buffer)?; write_to_output(&mut decoded_buffer, output)?; + + if had_invalid_tail { + return Err(USimpleError::new(1, "error: invalid input".to_owned())); + } } Ok(()) diff --git a/src/uucore/src/lib/features/encoding.rs b/src/uucore/src/lib/features/encoding.rs index 6c6261c2cbf..2f7caae2b78 100644 --- a/src/uucore/src/lib/features/encoding.rs +++ b/src/uucore/src/lib/features/encoding.rs @@ -214,6 +214,11 @@ impl EncodingWrapper { } } +pub struct PadResult { + pub chunk: Vec, + pub had_invalid_tail: bool, +} + pub trait SupportsFastDecodeAndEncode { /// Returns the list of characters used by this encoding fn alphabet(&self) -> &'static [u8]; @@ -245,6 +250,19 @@ pub trait SupportsFastDecodeAndEncode { /// /// The decoding performed by `fast_decode` depends on this number being correct. fn valid_decoding_multiple(&self) -> usize; + + /// Whether the decoder can flush partial chunks (multiples of `valid_decoding_multiple`) + /// before seeing the full input. Defaults to `false` for encodings that must consume the + /// entire input (e.g. base58). + fn supports_partial_decode(&self) -> bool { + false + } + + /// Gives encoding-specific logic a chance to pad a trailing, non-empty remainder + /// before the final decode attempt. The default implementation opts out. + fn pad_remainder(&self, _remainder: &[u8]) -> Option { + None + } } impl SupportsFastDecodeAndEncode for Base58Wrapper { @@ -504,3 +522,80 @@ impl SupportsFastDecodeAndEncode for EncodingWrapper { self.unpadded_multiple } } + +pub struct Base32Wrapper { + inner: EncodingWrapper, +} + +impl Base32Wrapper { + pub fn new( + encoding: Encoding, + valid_decoding_multiple: usize, + unpadded_multiple: usize, + alphabet: &'static [u8], + ) -> Self { + Self { + inner: EncodingWrapper::new( + encoding, + valid_decoding_multiple, + unpadded_multiple, + alphabet, + ), + } + } +} + +impl SupportsFastDecodeAndEncode for Base32Wrapper { + fn alphabet(&self) -> &'static [u8] { + self.inner.alphabet() + } + + fn decode_into_vec(&self, input: &[u8], output: &mut Vec) -> UResult<()> { + self.inner.decode_into_vec(input, output) + } + + fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque) -> UResult<()> { + self.inner.encode_to_vec_deque(input, output) + } + + fn unpadded_multiple(&self) -> usize { + self.inner.unpadded_multiple() + } + + fn valid_decoding_multiple(&self) -> usize { + self.inner.valid_decoding_multiple() + } + + fn pad_remainder(&self, remainder: &[u8]) -> Option { + if remainder.is_empty() || remainder.contains(&b'=') { + return None; + } + + const VALID_REMAINDERS: [usize; 4] = [2, 4, 5, 7]; + + let mut len = remainder.len(); + let mut trimmed = false; + + while len > 0 && !VALID_REMAINDERS.contains(&len) { + len -= 1; + trimmed = true; + } + + if len == 0 { + return None; + } + + let mut padded = remainder[..len].to_vec(); + let missing = self.valid_decoding_multiple() - padded.len(); + padded.extend(std::iter::repeat_n(b'=', missing)); + + Some(PadResult { + chunk: padded, + had_invalid_tail: trimmed, + }) + } + + fn supports_partial_decode(&self) -> bool { + true + } +} diff --git a/tests/by-util/test_basenc.rs b/tests/by-util/test_basenc.rs index f02de772b20..a3c92b8850e 100644 --- a/tests/by-util/test_basenc.rs +++ b/tests/by-util/test_basenc.rs @@ -4,6 +4,7 @@ // file that was distributed with this source code. // spell-checker: ignore (encodings) lsbf msbf +// spell-checker: ignore autopad MFRGG MFRGGZDF abcdeabc baddecode CPNMUO use uutests::{at_and_ucmd, new_ucmd}; @@ -112,6 +113,63 @@ fn test_base32hex_decode() { .stdout_only("nice>base?"); } +#[test] +fn test_base32_autopad_short_quantum() { + new_ucmd!() + .args(&["--base32", "--decode"]) + .pipe_in("MFRGG") + .succeeds() + .stdout_only("abc"); +} + +#[test] +fn test_base32_autopad_multiline_stream() { + new_ucmd!() + .args(&["--base32", "--decode"]) + .pipe_in("MFRGGZDF\nMFRGG") + .succeeds() + .stdout_only("abcdeabc"); +} + +#[test] +fn test_base32_baddecode_keeps_prefix() { + new_ucmd!() + .args(&["--base32", "--decode"]) + .pipe_in("MFRGGZDF=") + .fails() + .stdout_is("abcde") + .stderr_is("basenc: error: invalid input\n"); +} + +#[test] +fn test_base32hex_autopad_short_quantum() { + new_ucmd!() + .args(&["--base32hex", "--decode"]) + .pipe_in("C5H66") + .succeeds() + .stdout_only("abc"); +} + +#[test] +fn test_base32hex_rejects_trailing_garbage() { + new_ucmd!() + .args(&["--base32hex", "-d"]) + .pipe_in("VNC0FKD5W") + .fails() + .stdout_is_bytes(b"\xFD\xD8\x07\xD1\xA5") + .stderr_is("basenc: error: invalid input\n"); +} + +#[test] +fn test_base32hex_truncated_block_keeps_prefix() { + new_ucmd!() + .args(&["--base32hex", "-d"]) + .pipe_in("CPNMUO") + .fails() + .stdout_is_bytes(b"foo") + .stderr_is("basenc: error: invalid input\n"); +} + #[test] fn test_base16() { new_ucmd!() diff --git a/util/build-gnu.sh b/util/build-gnu.sh index 734088252f4..401bf624b18 100755 --- a/util/build-gnu.sh +++ b/util/build-gnu.sh @@ -268,7 +268,7 @@ sed -i -e "s|invalid suffix in --pages argument|invalid --pages argument|" \ # When decoding an invalid base32/64 string, gnu writes everything it was able to decode until # it hit the decode error, while we don't write anything if the input is invalid. sed -i "s/\(baddecode.*OUT=>\"\).*\"/\1\"/g" tests/basenc/base64.pl -sed -i "s/\(\(b2[ml]_[69]\|b32h_[56]\|z85_8\|z85_35\).*OUT=>\)[^}]*\(.*\)/\1\"\"\3/g" tests/basenc/basenc.pl +sed -i "s/\(\(b2[ml]_[69]\|z85_8\|z85_35\).*OUT=>\)[^}]*\(.*\)/\1\"\"\3/g" tests/basenc/basenc.pl # add "error: " to the expected error message sed -i "s/\$prog: invalid input/\$prog: error: invalid input/g" tests/basenc/basenc.pl