Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
201 changes: 122 additions & 79 deletions src/uu/base32/src/base_common.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
use clap::{Arg, ArgAction, Command};
use std::ffi::OsString;
use std::fs::File;
use std::io::{self, ErrorKind, Read, Seek};
use std::io::{self, ErrorKind, Read, Seek, Write};
use std::path::{Path, PathBuf};
use uucore::display::Quotable;
use uucore::encoding::{
BASE2LSBF, BASE2MSBF, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
BASE2LSBF, BASE2MSBF, Base32Wrapper, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
SupportsFastDecodeAndEncode, Z85Wrapper,
for_base_common::{BASE32, BASE32HEX, BASE64URL, HEXUPPER_PERMISSIVE},
};
Expand Down Expand Up @@ -193,7 +193,7 @@ pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Confi

let supports_fast_decode_and_encode_ref = supports_fast_decode_and_encode.as_ref();
let mut stdout_lock = io::stdout().lock();
if config.decode {
let result = if config.decode {
fast_decode::fast_decode(
read,
&mut stdout_lock,
Expand All @@ -207,6 +207,14 @@ pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Confi
supports_fast_decode_and_encode_ref,
config.wrap_cols,
)
};

// Ensure any pending stdout buffer is flushed even if decoding failed; GNU basenc
// keeps already-decoded bytes visible before reporting the error.
match (result, stdout_lock.flush()) {
(res, Ok(())) => res,
(Ok(_), Err(err)) => Err(err.into()),
(Err(original), Err(_)) => Err(original),
}
}

Expand Down Expand Up @@ -247,14 +255,14 @@ pub fn get_supports_fast_decode_and_encode(
// spell-checker:disable-next-line
b"01",
)),
Format::Base32 => Box::from(EncodingWrapper::new(
Format::Base32 => Box::from(Base32Wrapper::new(
BASE32,
BASE32_VALID_DECODING_MULTIPLE,
BASE32_UNPADDED_MULTIPLE,
// spell-checker:disable-next-line
b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=",
)),
Format::Base32Hex => Box::from(EncodingWrapper::new(
Format::Base32Hex => Box::from(Base32Wrapper::new(
BASE32HEX,
BASE32_VALID_DECODING_MULTIPLE,
BASE32_UNPADDED_MULTIPLE,
Expand Down Expand Up @@ -502,43 +510,21 @@ pub mod fast_encode {

pub mod fast_decode {
use std::io::{self, Write};
use uucore::{encoding::SupportsFastDecodeAndEncode, error::UResult};
use uucore::{
encoding::SupportsFastDecodeAndEncode,
error::{UResult, USimpleError},
};

// Start of helper functions
fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256] {
// If `ignore_garbage` is enabled, all characters outside the alphabet are ignored
// If it is not enabled, only '\n' and '\r' are ignored
if ignore_garbage {
// Note: "false" here
let mut table = [false; 256];

// Pass through no characters except those in the alphabet
for ue in alphabet {
let us = usize::from(*ue);

// Should not have been set yet
assert!(!table[us]);

table[us] = true;
}
fn alphabet_lookup(alphabet: &[u8]) -> [bool; 256] {
// Precompute O(1) membership checks so we can validate every byte before decoding.
let mut table = [false; 256];

table
} else {
// Note: "true" here
let mut table = [true; 256];

// Pass through all characters except '\n' and '\r'
for ue in [b'\n', b'\r'] {
let us = usize::from(ue);

// Should not have been set yet
assert!(table[us]);

table[us] = false;
}

table
for &byte in alphabet {
table[usize::from(byte)] = true;
}

table
}

fn decode_in_chunks_to_buffer(
Expand All @@ -553,11 +539,44 @@ pub mod fast_decode {
/// Drains `decoded_buffer` into `output`.
///
/// Every buffered byte is written and `output` is flushed before the buffer is emptied, so a
/// failed write or flush returns the I/O error and leaves the buffer contents untouched.
fn write_to_output(decoded_buffer: &mut Vec<u8>, output: &mut dyn Write) -> io::Result<()> {
    // Write first, then flush; either failure short-circuits before the buffer is cleared.
    output
        .write_all(&decoded_buffer[..])
        .and_then(|()| output.flush())?;

    // Empty the buffer so the caller can fill it with the next decoded chunk.
    decoded_buffer.clear();

    Ok(())
}

/// Decodes and emits as many whole decode blocks as `buffer` currently holds.
///
/// Each pass takes at most `block_limit` bytes from the front of `buffer`, rounded down to a
/// multiple of `valid_multiple`, decodes them into `decoded_buffer`, writes the result to
/// `output`, and then removes the consumed bytes from `buffer`. Bytes that do not fill a whole
/// block stay in `buffer` for the caller. `valid_multiple` must be non-zero.
///
/// # Errors
///
/// Returns the first decode or write failure; the chunk being processed is not removed from
/// `buffer` in that case.
fn flush_ready_chunks(
    buffer: &mut Vec<u8>,
    block_limit: usize,
    valid_multiple: usize,
    supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
    decoded_buffer: &mut Vec<u8>,
    output: &mut dyn Write,
) -> UResult<()> {
    loop {
        let available = buffer.len();

        // Less than one block buffered: nothing more can be decoded yet.
        if available < valid_multiple {
            return Ok(());
        }

        // Cap the chunk at `block_limit`, then round down to a whole number of blocks.
        let capped = available.min(block_limit);
        let chunk_len = capped - (capped % valid_multiple);

        // Happens when `block_limit` itself is smaller than a single block.
        if chunk_len < valid_multiple {
            return Ok(());
        }

        let (ready, _pending) = buffer.split_at(chunk_len);

        decode_in_chunks_to_buffer(supports_fast_decode_and_encode, ready, decoded_buffer)?;

        write_to_output(decoded_buffer, output)?;

        // Only drop the input once it has been decoded and written successfully.
        buffer.drain(..chunk_len);
    }
}
// End of helper functions

pub fn fast_decode(
Expand All @@ -569,22 +588,12 @@ pub mod fast_decode {
const DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024;

let alphabet = supports_fast_decode_and_encode.alphabet();
let decode_in_chunks_of_size = supports_fast_decode_and_encode.valid_decoding_multiple()
* DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
let alphabet_table = alphabet_lookup(alphabet);
let valid_multiple = supports_fast_decode_and_encode.valid_decoding_multiple();
let decode_in_chunks_of_size = valid_multiple * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;

assert!(decode_in_chunks_of_size > 0);

// Note that it's not worth using "data-encoding"'s ignore functionality if `ignore_garbage` is true, because
// "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before
// passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also
// allows execution to stay on the happy path in "data-encoding":
// https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756
// It is also not worth using "data-encoding"'s ignore functionality when `ignore_garbage` is
// false.
// Note that the alphabet constants above already include the padding characters
// TODO
// Precompute this
let table = alphabet_to_table(alphabet, ignore_garbage);
assert!(valid_multiple > 0);

// Start of buffers

Expand All @@ -595,35 +604,69 @@ pub mod fast_decode {

let mut buffer = Vec::with_capacity(decode_in_chunks_of_size);

input
.iter()
.filter(|ch| table[usize::from(**ch)])
.for_each(|ch| {
buffer.push(*ch);
// How many bytes to steal from `read_buffer` to get
// `leftover_buffer` to the right size
if buffer.len() == decode_in_chunks_of_size {
assert_eq!(decode_in_chunks_of_size, buffer.len());
// Decode data in chunks, then place it in `decoded_buffer`
decode_in_chunks_to_buffer(
supports_fast_decode_and_encode,
&buffer,
&mut decoded_buffer,
)
.unwrap();
// Write all data in `decoded_buffer` to `output`
write_to_output(&mut decoded_buffer, output).unwrap();
buffer.clear();
}
});
// Cleanup
// `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed
{
// Decode all remaining encoded bytes, placing them in `decoded_buffer`
supports_fast_decode_and_encode.decode_into_vec(&buffer, &mut decoded_buffer)?;
let supports_partial_decode = supports_fast_decode_and_encode.supports_partial_decode();

// Write all data in `decoded_buffer` to `output`
for &byte in &input {
if byte == b'\n' || byte == b'\r' {
continue;
}

if alphabet_table[usize::from(byte)] {
buffer.push(byte);
} else if ignore_garbage {
continue;
} else {
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
}

if supports_partial_decode {
flush_ready_chunks(
&mut buffer,
decode_in_chunks_of_size,
valid_multiple,
supports_fast_decode_and_encode,
&mut decoded_buffer,
output,
)?;
} else if buffer.len() == decode_in_chunks_of_size {
decode_in_chunks_to_buffer(
supports_fast_decode_and_encode,
&buffer,
&mut decoded_buffer,
)?;
write_to_output(&mut decoded_buffer, output)?;
buffer.clear();
}
}

if supports_partial_decode {
flush_ready_chunks(
&mut buffer,
decode_in_chunks_of_size,
valid_multiple,
supports_fast_decode_and_encode,
&mut decoded_buffer,
output,
)?;
}

if !buffer.is_empty() {
let mut owned_chunk: Option<Vec<u8>> = None;
let mut had_invalid_tail = false;

if let Some(pad_result) = supports_fast_decode_and_encode.pad_remainder(&buffer) {
had_invalid_tail = pad_result.had_invalid_tail;
owned_chunk = Some(pad_result.chunk);
}

let final_chunk = owned_chunk.as_deref().unwrap_or(&buffer);

supports_fast_decode_and_encode.decode_into_vec(final_chunk, &mut decoded_buffer)?;
write_to_output(&mut decoded_buffer, output)?;

if had_invalid_tail {
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
}
}

Ok(())
Expand Down
95 changes: 95 additions & 0 deletions src/uucore/src/lib/features/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -214,6 +214,11 @@ impl EncodingWrapper {
}
}

/// Outcome of padding the trailing remainder of an encoded input before the final decode.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PadResult {
    /// The bytes to decode in place of the original remainder, padding included.
    pub chunk: Vec<u8>,
    /// `true` when trailing bytes of the remainder had to be dropped to build `chunk`.
    pub had_invalid_tail: bool,
}

pub trait SupportsFastDecodeAndEncode {
/// Returns the list of characters used by this encoding
fn alphabet(&self) -> &'static [u8];
Expand Down Expand Up @@ -245,6 +250,19 @@ pub trait SupportsFastDecodeAndEncode {
///
/// The decoding performed by `fast_decode` depends on this number being correct.
fn valid_decoding_multiple(&self) -> usize;

/// Reports whether the decoder can flush partial chunks (multiples of
/// `valid_decoding_multiple`) before seeing the full input.
///
/// The default implementation returns `false`, which suits encodings that must consume the
/// entire input (e.g. base58). Implementors that can decode block by block override this to
/// return `true`.
fn supports_partial_decode(&self) -> bool {
false
}

/// Gives encoding-specific logic a chance to pad a trailing, non-empty remainder
/// before the final decode attempt.
///
/// Returns `Some` with the padded chunk (and whether trailing bytes were dropped) when the
/// encoding wants to rewrite the remainder, or `None` to leave the remainder as it is.
/// The default implementation opts out by always returning `None`.
fn pad_remainder(&self, _remainder: &[u8]) -> Option<PadResult> {
None
}
}

impl SupportsFastDecodeAndEncode for Base58Wrapper {
Expand Down Expand Up @@ -504,3 +522,80 @@ impl SupportsFastDecodeAndEncode for EncodingWrapper {
self.unpadded_multiple
}
}

/// Wrapper around an [`EncodingWrapper`] used for the base32 formats.
///
/// Its `SupportsFastDecodeAndEncode` implementation forwards alphabet, encode, decode and
/// multiple queries to the wrapped value, and additionally opts into partial decoding and
/// padding of a trailing remainder.
pub struct Base32Wrapper {
/// Wrapped encoder that performs the actual encoding and decoding.
inner: EncodingWrapper,
}

impl Base32Wrapper {
    /// Builds a wrapper whose work is delegated to an [`EncodingWrapper`] constructed from the
    /// same four arguments, passed through unchanged.
    pub fn new(
        encoding: Encoding,
        valid_decoding_multiple: usize,
        unpadded_multiple: usize,
        alphabet: &'static [u8],
    ) -> Self {
        let inner = EncodingWrapper::new(
            encoding,
            valid_decoding_multiple,
            unpadded_multiple,
            alphabet,
        );

        Self { inner }
    }
}

impl SupportsFastDecodeAndEncode for Base32Wrapper {
    /// Returns the alphabet of the wrapped encoder.
    fn alphabet(&self) -> &'static [u8] {
        self.inner.alphabet()
    }

    /// Decodes `input` into `output` by delegating to the wrapped encoder.
    fn decode_into_vec(&self, input: &[u8], output: &mut Vec<u8>) -> UResult<()> {
        self.inner.decode_into_vec(input, output)
    }

    /// Encodes `input` into `output` by delegating to the wrapped encoder.
    fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque<u8>) -> UResult<()> {
        self.inner.encode_to_vec_deque(input, output)
    }

    /// Returns the unpadded multiple of the wrapped encoder.
    fn unpadded_multiple(&self) -> usize {
        self.inner.unpadded_multiple()
    }

    /// Returns the valid decoding multiple of the wrapped encoder.
    fn valid_decoding_multiple(&self) -> usize {
        self.inner.valid_decoding_multiple()
    }

    /// Pads an unpadded trailing remainder with `=` up to `valid_decoding_multiple()` bytes.
    ///
    /// The remainder is first cut back to the longest accepted length (2, 4, 5 or 7 bytes —
    /// the lengths at which a base32 group ends on a whole byte per RFC 4648);
    /// `had_invalid_tail` records whether anything was cut.
    ///
    /// Returns `None`, leaving the remainder untouched, when it is empty, already contains
    /// `=`, is too short to keep any accepted length, or when the configured decoding multiple
    /// is smaller than the kept length.
    fn pad_remainder(&self, remainder: &[u8]) -> Option<PadResult> {
        // Accepted unpadded lengths, ascending; searched from the back to find the longest fit.
        const VALID_REMAINDERS: [usize; 4] = [2, 4, 5, 7];

        // Input that already carries padding is passed on as-is.
        if remainder.contains(&b'=') {
            return None;
        }

        // Longest accepted length that fits; nothing fits an empty or single-byte remainder.
        let keep = VALID_REMAINDERS
            .iter()
            .rev()
            .copied()
            .find(|&candidate| candidate <= remainder.len())?;

        // `new` accepts any multiple, so guard the subtraction instead of risking an
        // underflow (panic in debug builds, an enormous allocation in release builds).
        let missing = self.valid_decoding_multiple().checked_sub(keep)?;

        let mut chunk = Vec::with_capacity(keep + missing);
        chunk.extend_from_slice(&remainder[..keep]);
        chunk.resize(keep + missing, b'=');

        Some(PadResult {
            chunk,
            had_invalid_tail: keep != remainder.len(),
        })
    }

    /// Always `true`: input may be decoded in multiples of `valid_decoding_multiple()` before
    /// the whole input has been seen.
    fn supports_partial_decode(&self) -> bool {
        true
    }
}
Loading
Loading