Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
185 changes: 108 additions & 77 deletions src/uu/base32/src/base_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ use std::io::{self, ErrorKind, Read, Seek};
use std::path::{Path, PathBuf};
use uucore::display::Quotable;
use uucore::encoding::{
BASE2LSBF, BASE2MSBF, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
BASE2LSBF, BASE2MSBF, Base32Wrapper, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
SupportsFastDecodeAndEncode, Z85Wrapper,
for_base_common::{BASE32, BASE32HEX, BASE64URL, HEXUPPER_PERMISSIVE},
};
Expand Down Expand Up @@ -247,14 +247,14 @@ pub fn get_supports_fast_decode_and_encode(
// spell-checker:disable-next-line
b"01",
)),
Format::Base32 => Box::from(EncodingWrapper::new(
Format::Base32 => Box::from(Base32Wrapper::new(
BASE32,
BASE32_VALID_DECODING_MULTIPLE,
BASE32_UNPADDED_MULTIPLE,
// spell-checker:disable-next-line
b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=",
)),
Format::Base32Hex => Box::from(EncodingWrapper::new(
Format::Base32Hex => Box::from(Base32Wrapper::new(
BASE32HEX,
BASE32_VALID_DECODING_MULTIPLE,
BASE32_UNPADDED_MULTIPLE,
Expand Down Expand Up @@ -502,43 +502,21 @@ pub mod fast_encode {

pub mod fast_decode {
use std::io::{self, Write};
use uucore::{encoding::SupportsFastDecodeAndEncode, error::UResult};
use uucore::{
encoding::SupportsFastDecodeAndEncode,
error::{UResult, USimpleError},
};

// Start of helper functions
fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256] {
// If `ignore_garbage` is enabled, all characters outside the alphabet are ignored
// If it is not enabled, only '\n' and '\r' are ignored
if ignore_garbage {
// Note: "false" here
let mut table = [false; 256];

// Pass through no characters except those in the alphabet
for ue in alphabet {
let us = usize::from(*ue);

// Should not have been set yet
assert!(!table[us]);

table[us] = true;
}

table
} else {
// Note: "true" here
let mut table = [true; 256];

// Pass through all characters except '\n' and '\r'
for ue in [b'\n', b'\r'] {
let us = usize::from(ue);
fn alphabet_lookup(alphabet: &[u8]) -> [bool; 256] {
// Precompute O(1) membership checks so we can validate every byte before decoding.
let mut table = [false; 256];

// Should not have been set yet
assert!(table[us]);

table[us] = false;
}

table
for &byte in alphabet {
table[usize::from(byte)] = true;
}

table
}

fn decode_in_chunks_to_buffer(
Expand All @@ -558,6 +536,36 @@ pub mod fast_decode {

Ok(())
}

/// Decodes and writes out as much of `buffer` as currently forms whole decoding groups.
///
/// On each pass, at most `block_limit` bytes are considered, rounded down to a multiple of
/// `valid_multiple`. That prefix is decoded into `decoded_buffer`, written to `output`, and
/// then removed from the front of `buffer`. Any tail shorter than `valid_multiple` stays in
/// `buffer` for the caller to deal with later.
///
/// # Errors
///
/// Propagates any error from decoding the prefix or from writing to `output`.
fn flush_ready_chunks(
    buffer: &mut Vec<u8>,
    block_limit: usize,
    valid_multiple: usize,
    supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
    decoded_buffer: &mut Vec<u8>,
    output: &mut dyn Write,
) -> UResult<()> {
    loop {
        let pending = buffer.len();

        // Not even one complete group is buffered yet.
        if pending < valid_multiple {
            return Ok(());
        }

        // Cap the window at `block_limit`, then trim it down to whole groups.
        let window = pending.min(block_limit);
        let ready = window - (window % valid_multiple);

        // `block_limit` may be smaller than a single group; nothing can be flushed then.
        if ready < valid_multiple {
            return Ok(());
        }

        decode_in_chunks_to_buffer(
            supports_fast_decode_and_encode,
            &buffer[..ready],
            decoded_buffer,
        )?;

        write_to_output(decoded_buffer, output)?;

        // Drop the consumed prefix; the remaining bytes shift to the front.
        buffer.drain(..ready);
    }
}
// End of helper functions

pub fn fast_decode(
Expand All @@ -569,22 +577,22 @@ pub mod fast_decode {
const DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024;

let alphabet = supports_fast_decode_and_encode.alphabet();
let decode_in_chunks_of_size = supports_fast_decode_and_encode.valid_decoding_multiple()
* DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
let alphabet_table = alphabet_lookup(alphabet);
let valid_multiple = supports_fast_decode_and_encode.valid_decoding_multiple();
let decode_in_chunks_of_size = valid_multiple * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;

assert!(decode_in_chunks_of_size > 0);

// Note that it's not worth using "data-encoding"'s ignore functionality if `ignore_garbage` is true, because
// "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before
// passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also
// allows execution to stay on the happy path in "data-encoding":
// https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756
// It is also not worth using "data-encoding"'s ignore functionality when `ignore_garbage` is
// false.
// Note that the alphabet constants above already include the padding characters
// TODO
// Precompute this
let table = alphabet_to_table(alphabet, ignore_garbage);
assert!(valid_multiple > 0);

if !ignore_garbage {
// Match GNU basenc: fail fast when any non alphabet/non newline slips through without -i.
if input
.iter()
.any(|&byte| byte != b'\n' && byte != b'\r' && !alphabet_table[usize::from(byte)])
{
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
}
}

// Start of buffers

Expand All @@ -595,34 +603,57 @@ pub mod fast_decode {

let mut buffer = Vec::with_capacity(decode_in_chunks_of_size);

input
.iter()
.filter(|ch| table[usize::from(**ch)])
.for_each(|ch| {
buffer.push(*ch);
// How many bytes to steal from `read_buffer` to get
// `leftover_buffer` to the right size
if buffer.len() == decode_in_chunks_of_size {
assert_eq!(decode_in_chunks_of_size, buffer.len());
// Decode data in chunks, then place it in `decoded_buffer`
decode_in_chunks_to_buffer(
supports_fast_decode_and_encode,
&buffer,
&mut decoded_buffer,
)
.unwrap();
// Write all data in `decoded_buffer` to `output`
write_to_output(&mut decoded_buffer, output).unwrap();
buffer.clear();
}
});
// Cleanup
// `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed
{
// Decode all remaining encoded bytes, placing them in `decoded_buffer`
supports_fast_decode_and_encode.decode_into_vec(&buffer, &mut decoded_buffer)?;
let supports_partial_decode = supports_fast_decode_and_encode.supports_partial_decode();

for &byte in &input {
if byte == b'\n' || byte == b'\r' {
continue;
}

if alphabet_table[usize::from(byte)] {
buffer.push(byte);
} else if ignore_garbage {
continue;
} else {
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
}

if supports_partial_decode {
flush_ready_chunks(
&mut buffer,
decode_in_chunks_of_size,
valid_multiple,
supports_fast_decode_and_encode,
&mut decoded_buffer,
output,
)?;
} else if buffer.len() == decode_in_chunks_of_size {
decode_in_chunks_to_buffer(
supports_fast_decode_and_encode,
&buffer,
&mut decoded_buffer,
)?;
write_to_output(&mut decoded_buffer, output)?;
buffer.clear();
}
}

if supports_partial_decode {
flush_ready_chunks(
&mut buffer,
decode_in_chunks_of_size,
valid_multiple,
supports_fast_decode_and_encode,
&mut decoded_buffer,
output,
)?;
}

if !buffer.is_empty() {
let padded = supports_fast_decode_and_encode.pad_remainder(&buffer);
let final_chunk = padded.as_deref().unwrap_or(&buffer);

// Write all data in `decoded_buffer` to `output`
supports_fast_decode_and_encode.decode_into_vec(final_chunk, &mut decoded_buffer)?;
write_to_output(&mut decoded_buffer, output)?;
}

Expand Down
78 changes: 78 additions & 0 deletions src/uucore/src/lib/features/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -245,6 +245,19 @@ pub trait SupportsFastDecodeAndEncode {
///
/// The decoding performed by `fast_decode` depends on this number being correct.
fn valid_decoding_multiple(&self) -> usize;

/// Reports whether the decoder can flush partial chunks (multiples of
/// `valid_decoding_multiple`) before seeing the full input.
///
/// The default implementation returns `false`, which is meant for encodings that must
/// consume the entire input (e.g. base58). Implementors that can decode complete groups
/// independently override this to return `true`.
fn supports_partial_decode(&self) -> bool {
false
}

/// Gives encoding-specific logic a chance to pad a trailing, non-empty remainder
/// before the final decode attempt.
///
/// Returns `Some(padded)` with the bytes that should be decoded in place of `_remainder`,
/// or `None` to have the remainder decoded unchanged. The default implementation opts
/// out by always returning `None`.
fn pad_remainder(&self, _remainder: &[u8]) -> Option<Vec<u8>> {
None
}
}

impl SupportsFastDecodeAndEncode for Base58Wrapper {
Expand Down Expand Up @@ -504,3 +517,68 @@ impl SupportsFastDecodeAndEncode for EncodingWrapper {
self.unpadded_multiple
}
}

/// Wrapper for the base32 family of encodings.
///
/// It holds an [`EncodingWrapper`] that performs the actual encoding and decoding. Its
/// `SupportsFastDecodeAndEncode` implementation additionally reports support for partial
/// decoding and can pad an unpadded trailing remainder with `=`.
pub struct Base32Wrapper {
// The wrapped encoder/decoder that all core operations are delegated to.
inner: EncodingWrapper,
}

impl Base32Wrapper {
    /// Creates a wrapper around a freshly built [`EncodingWrapper`].
    ///
    /// All four arguments are forwarded unchanged, and in the same order, to
    /// `EncodingWrapper::new`.
    pub fn new(
        encoding: Encoding,
        valid_decoding_multiple: usize,
        unpadded_multiple: usize,
        alphabet: &'static [u8],
    ) -> Self {
        let inner = EncodingWrapper::new(
            encoding,
            valid_decoding_multiple,
            unpadded_multiple,
            alphabet,
        );

        Self { inner }
    }
}

impl SupportsFastDecodeAndEncode for Base32Wrapper {
    /// Alphabet of the wrapped encoding.
    fn alphabet(&self) -> &'static [u8] {
        self.inner.alphabet()
    }

    /// Decodes `input` by delegating to the wrapped encoding.
    fn decode_into_vec(&self, input: &[u8], output: &mut Vec<u8>) -> UResult<()> {
        self.inner.decode_into_vec(input, output)
    }

    /// Encodes `input` by delegating to the wrapped encoding.
    fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque<u8>) -> UResult<()> {
        self.inner.encode_to_vec_deque(input, output)
    }

    /// Unpadded multiple of the wrapped encoding.
    fn unpadded_multiple(&self) -> usize {
        self.inner.unpadded_multiple()
    }

    /// Valid decoding multiple of the wrapped encoding.
    fn valid_decoding_multiple(&self) -> usize {
        self.inner.valid_decoding_multiple()
    }

    /// Complete groups can be decoded before the whole input has been seen.
    fn supports_partial_decode(&self) -> bool {
        true
    }

    /// Completes an unpadded trailing remainder with `=` up to a full decoding group.
    ///
    /// Returns `None` (leaving the remainder to be decoded unchanged) when the remainder
    /// is empty, already contains `=`, or has a length other than 2, 4, 5 or 7.
    fn pad_remainder(&self, remainder: &[u8]) -> Option<Vec<u8>> {
        // Input that already carries padding is left for the decoder to accept or reject.
        let already_padded = remainder.iter().any(|&symbol| symbol == b'=');

        // Only these tail lengths are accepted; this also rules out an empty remainder.
        let plausible_len = matches!(remainder.len(), 2 | 4 | 5 | 7);

        if already_padded || !plausible_len {
            return None;
        }

        let group_len = self.valid_decoding_multiple();
        let missing = group_len - remainder.len();

        let mut padded = Vec::with_capacity(group_len);
        padded.extend_from_slice(remainder);
        padded.extend(std::iter::repeat_n(b'=', missing));

        Some(padded)
    }
}
56 changes: 56 additions & 0 deletions tests/by-util/test_basenc.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
// file that was distributed with this source code.

// spell-checker: ignore (encodings) lsbf msbf
// spell-checker: ignore autopad MFRGG MFRGGZDF abcdeabc baddecode

use uutests::{at_and_ucmd, new_ucmd};

Expand Down Expand Up @@ -112,6 +113,61 @@ fn test_base32hex_decode() {
.stdout_only("nice>base?");
}

// A final base32 group given without its `=` padding ("MFRGG" instead of "MFRGG===")
// must still decode successfully, here to "abc".
#[test]
fn test_base32_autopad_short_quantum() {
new_ucmd!()
.args(&["--base32", "--decode"])
.pipe_in("MFRGG")
.succeeds()
.stdout_only("abc");
}

// A full 8-character group, a newline, then an unpadded 5-character group: the expected
// output is the concatenation of both decoded parts ("abcde" followed by "abc").
#[test]
fn test_base32_autopad_multiline_stream() {
new_ucmd!()
.args(&["--base32", "--decode"])
.pipe_in("MFRGGZDF\nMFRGG")
.succeeds()
.stdout_only("abcdeabc");
}

// A valid 8-character group followed by a stray `=`: the command must fail with an
// "invalid input" error, yet the already-decodable prefix ("abcde") is still expected
// on stdout.
#[test]
fn test_base32_baddecode_keeps_prefix() {
let result = new_ucmd!()
.args(&["--base32", "--decode"])
.pipe_in("MFRGGZDF=")
.fails();
result.stdout_is("abcde");
// Only the start of stderr is checked, so any trailing text after the message is tolerated.
assert!(
result
.stderr_str()
.starts_with("basenc: error: invalid input")
);
}

// Same as the base32 auto-padding case, but for the base32hex alphabet: the unpadded
// 5-character input "C5H66" must decode to "abc".
#[test]
fn test_base32hex_autopad_short_quantum() {
new_ucmd!()
.args(&["--base32hex", "--decode"])
.pipe_in("C5H66")
.succeeds()
.stdout_only("abc");
}

// Input ending in `W`, which is not part of the RFC 4648 base32hex alphabet (0-9, A-V):
// decoding must fail with an "invalid input" error and write nothing to stdout.
#[test]
fn test_base32hex_rejects_trailing_garbage() {
let result = new_ucmd!()
.args(&["--base32hex", "-d"])
.pipe_in("VNC0FKD5W")
.fails();
result.stdout_is("");
// Only the start of stderr is checked, so any trailing text after the message is tolerated.
assert!(
result
.stderr_str()
.starts_with("basenc: error: invalid input")
);
}

#[test]
fn test_base16() {
new_ucmd!()
Expand Down
Loading