Skip to content

Commit 364d9e9

Browse files
authored
basenc: Make the basenc.pl GNU-compat tests pass (#9203)
* fix(basenc): align base32 decode with GNU
* Add GNU-style basenc base32 tests
* Expand basenc base32 tests and simplify failures: adds the GNU-style auto-padding/truncated cases to tests/by-util/test_basenc.rs and rewrites the failure assertions to use the chained fails().stdout_*(…).stderr_is(…) style for clarity.
* Restore GNU expectations for b32h_5 and b32h_6: updates util/build-gnu.sh to stop forcing those two basenc tests to expect empty stdout, so the GNU suite again checks for the leaked five bytes before failure.
* Allow base32 decoder to auto-pad truncated blocks: introduces PadResult, trims/pads incomplete base32 chunks, emits decoded prefixes, and still returns "error: invalid input", in line with GNU basenc.
1 parent eb223ba commit 364d9e9

File tree

4 files changed

+276
-80
lines changed

4 files changed

+276
-80
lines changed

src/uu/base32/src/base_common.rs

Lines changed: 122 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@
88
use clap::{Arg, ArgAction, Command};
99
use std::ffi::OsString;
1010
use std::fs::File;
11-
use std::io::{self, ErrorKind, Read, Seek};
11+
use std::io::{self, ErrorKind, Read, Seek, Write};
1212
use std::path::{Path, PathBuf};
1313
use uucore::display::Quotable;
1414
use uucore::encoding::{
15-
BASE2LSBF, BASE2MSBF, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
15+
BASE2LSBF, BASE2MSBF, Base32Wrapper, Base58Wrapper, Base64SimdWrapper, EncodingWrapper, Format,
1616
SupportsFastDecodeAndEncode, Z85Wrapper,
1717
for_base_common::{BASE32, BASE32HEX, BASE64URL, HEXUPPER_PERMISSIVE},
1818
};
@@ -193,7 +193,7 @@ pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Confi
193193

194194
let supports_fast_decode_and_encode_ref = supports_fast_decode_and_encode.as_ref();
195195
let mut stdout_lock = io::stdout().lock();
196-
if config.decode {
196+
let result = if config.decode {
197197
fast_decode::fast_decode(
198198
read,
199199
&mut stdout_lock,
@@ -207,6 +207,14 @@ pub fn handle_input<R: Read + Seek>(input: &mut R, format: Format, config: Confi
207207
supports_fast_decode_and_encode_ref,
208208
config.wrap_cols,
209209
)
210+
};
211+
212+
// Ensure any pending stdout buffer is flushed even if decoding failed; GNU basenc
213+
// keeps already-decoded bytes visible before reporting the error.
214+
match (result, stdout_lock.flush()) {
215+
(res, Ok(())) => res,
216+
(Ok(_), Err(err)) => Err(err.into()),
217+
(Err(original), Err(_)) => Err(original),
210218
}
211219
}
212220

@@ -247,14 +255,14 @@ pub fn get_supports_fast_decode_and_encode(
247255
// spell-checker:disable-next-line
248256
b"01",
249257
)),
250-
Format::Base32 => Box::from(EncodingWrapper::new(
258+
Format::Base32 => Box::from(Base32Wrapper::new(
251259
BASE32,
252260
BASE32_VALID_DECODING_MULTIPLE,
253261
BASE32_UNPADDED_MULTIPLE,
254262
// spell-checker:disable-next-line
255263
b"ABCDEFGHIJKLMNOPQRSTUVWXYZ234567=",
256264
)),
257-
Format::Base32Hex => Box::from(EncodingWrapper::new(
265+
Format::Base32Hex => Box::from(Base32Wrapper::new(
258266
BASE32HEX,
259267
BASE32_VALID_DECODING_MULTIPLE,
260268
BASE32_UNPADDED_MULTIPLE,
@@ -502,43 +510,21 @@ pub mod fast_encode {
502510

503511
pub mod fast_decode {
504512
use std::io::{self, Write};
505-
use uucore::{encoding::SupportsFastDecodeAndEncode, error::UResult};
513+
use uucore::{
514+
encoding::SupportsFastDecodeAndEncode,
515+
error::{UResult, USimpleError},
516+
};
506517

507518
// Start of helper functions
508-
fn alphabet_to_table(alphabet: &[u8], ignore_garbage: bool) -> [bool; 256] {
509-
// If `ignore_garbage` is enabled, all characters outside the alphabet are ignored
510-
// If it is not enabled, only '\n' and '\r' are ignored
511-
if ignore_garbage {
512-
// Note: "false" here
513-
let mut table = [false; 256];
514-
515-
// Pass through no characters except those in the alphabet
516-
for ue in alphabet {
517-
let us = usize::from(*ue);
518-
519-
// Should not have been set yet
520-
assert!(!table[us]);
521-
522-
table[us] = true;
523-
}
519+
fn alphabet_lookup(alphabet: &[u8]) -> [bool; 256] {
520+
// Precompute O(1) membership checks so we can validate every byte before decoding.
521+
let mut table = [false; 256];
524522

525-
table
526-
} else {
527-
// Note: "true" here
528-
let mut table = [true; 256];
529-
530-
// Pass through all characters except '\n' and '\r'
531-
for ue in [b'\n', b'\r'] {
532-
let us = usize::from(ue);
533-
534-
// Should not have been set yet
535-
assert!(table[us]);
536-
537-
table[us] = false;
538-
}
539-
540-
table
523+
for &byte in alphabet {
524+
table[usize::from(byte)] = true;
541525
}
526+
527+
table
542528
}
543529

544530
fn decode_in_chunks_to_buffer(
@@ -553,11 +539,44 @@ pub mod fast_decode {
553539
/// Hands everything in `decoded_buffer` to `output`, flushes `output`, and then empties
/// `decoded_buffer` so it can be reused for the next chunk.
///
/// If either the write or the flush fails, the error is returned and `decoded_buffer` is left
/// untouched.
fn write_to_output(decoded_buffer: &mut Vec<u8>, output: &mut dyn Write) -> io::Result<()> {
    // Flush right after writing so bytes decoded so far are not left sitting in a buffer.
    output
        .write_all(&decoded_buffer[..])
        .and_then(|()| output.flush())?;

    decoded_buffer.clear();

    Ok(())
}
548+
549+
fn flush_ready_chunks(
550+
buffer: &mut Vec<u8>,
551+
block_limit: usize,
552+
valid_multiple: usize,
553+
supports_fast_decode_and_encode: &dyn SupportsFastDecodeAndEncode,
554+
decoded_buffer: &mut Vec<u8>,
555+
output: &mut dyn Write,
556+
) -> UResult<()> {
557+
// While at least one full decode block is buffered, keep draining
558+
// it and never yield more than block_limit per chunk.
559+
while buffer.len() >= valid_multiple {
560+
let take = buffer.len().min(block_limit);
561+
let aligned_take = take - (take % valid_multiple);
562+
563+
if aligned_take < valid_multiple {
564+
break;
565+
}
566+
567+
decode_in_chunks_to_buffer(
568+
supports_fast_decode_and_encode,
569+
&buffer[..aligned_take],
570+
decoded_buffer,
571+
)?;
572+
573+
write_to_output(decoded_buffer, output)?;
574+
575+
buffer.drain(..aligned_take);
576+
}
577+
578+
Ok(())
579+
}
561580
// End of helper functions
562581

563582
pub fn fast_decode(
@@ -569,22 +588,12 @@ pub mod fast_decode {
569588
const DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE: usize = 1_024;
570589

571590
let alphabet = supports_fast_decode_and_encode.alphabet();
572-
let decode_in_chunks_of_size = supports_fast_decode_and_encode.valid_decoding_multiple()
573-
* DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
591+
let alphabet_table = alphabet_lookup(alphabet);
592+
let valid_multiple = supports_fast_decode_and_encode.valid_decoding_multiple();
593+
let decode_in_chunks_of_size = valid_multiple * DECODE_IN_CHUNKS_OF_SIZE_MULTIPLE;
574594

575595
assert!(decode_in_chunks_of_size > 0);
576-
577-
// Note that it's not worth using "data-encoding"'s ignore functionality if `ignore_garbage` is true, because
578-
// "data-encoding"'s ignore functionality cannot discard non-ASCII bytes. The data has to be filtered before
579-
// passing it to "data-encoding", so there is no point in doing any filtering in "data-encoding". This also
580-
// allows execution to stay on the happy path in "data-encoding":
581-
// https://github.com/ia0/data-encoding/blob/4f42ad7ef242f6d243e4de90cd1b46a57690d00e/lib/src/lib.rs#L754-L756
582-
// It is also not worth using "data-encoding"'s ignore functionality when `ignore_garbage` is
583-
// false.
584-
// Note that the alphabet constants above already include the padding characters
585-
// TODO
586-
// Precompute this
587-
let table = alphabet_to_table(alphabet, ignore_garbage);
596+
assert!(valid_multiple > 0);
588597

589598
// Start of buffers
590599

@@ -595,35 +604,69 @@ pub mod fast_decode {
595604

596605
let mut buffer = Vec::with_capacity(decode_in_chunks_of_size);
597606

598-
input
599-
.iter()
600-
.filter(|ch| table[usize::from(**ch)])
601-
.for_each(|ch| {
602-
buffer.push(*ch);
603-
// How many bytes to steal from `read_buffer` to get
604-
// `leftover_buffer` to the right size
605-
if buffer.len() == decode_in_chunks_of_size {
606-
assert_eq!(decode_in_chunks_of_size, buffer.len());
607-
// Decode data in chunks, then place it in `decoded_buffer`
608-
decode_in_chunks_to_buffer(
609-
supports_fast_decode_and_encode,
610-
&buffer,
611-
&mut decoded_buffer,
612-
)
613-
.unwrap();
614-
// Write all data in `decoded_buffer` to `output`
615-
write_to_output(&mut decoded_buffer, output).unwrap();
616-
buffer.clear();
617-
}
618-
});
619-
// Cleanup
620-
// `input` has finished producing data, so the data remaining in the buffers needs to be decoded and printed
621-
{
622-
// Decode all remaining encoded bytes, placing them in `decoded_buffer`
623-
supports_fast_decode_and_encode.decode_into_vec(&buffer, &mut decoded_buffer)?;
607+
let supports_partial_decode = supports_fast_decode_and_encode.supports_partial_decode();
624608

625-
// Write all data in `decoded_buffer` to `output`
609+
for &byte in &input {
610+
if byte == b'\n' || byte == b'\r' {
611+
continue;
612+
}
613+
614+
if alphabet_table[usize::from(byte)] {
615+
buffer.push(byte);
616+
} else if ignore_garbage {
617+
continue;
618+
} else {
619+
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
620+
}
621+
622+
if supports_partial_decode {
623+
flush_ready_chunks(
624+
&mut buffer,
625+
decode_in_chunks_of_size,
626+
valid_multiple,
627+
supports_fast_decode_and_encode,
628+
&mut decoded_buffer,
629+
output,
630+
)?;
631+
} else if buffer.len() == decode_in_chunks_of_size {
632+
decode_in_chunks_to_buffer(
633+
supports_fast_decode_and_encode,
634+
&buffer,
635+
&mut decoded_buffer,
636+
)?;
637+
write_to_output(&mut decoded_buffer, output)?;
638+
buffer.clear();
639+
}
640+
}
641+
642+
if supports_partial_decode {
643+
flush_ready_chunks(
644+
&mut buffer,
645+
decode_in_chunks_of_size,
646+
valid_multiple,
647+
supports_fast_decode_and_encode,
648+
&mut decoded_buffer,
649+
output,
650+
)?;
651+
}
652+
653+
if !buffer.is_empty() {
654+
let mut owned_chunk: Option<Vec<u8>> = None;
655+
let mut had_invalid_tail = false;
656+
657+
if let Some(pad_result) = supports_fast_decode_and_encode.pad_remainder(&buffer) {
658+
had_invalid_tail = pad_result.had_invalid_tail;
659+
owned_chunk = Some(pad_result.chunk);
660+
}
661+
662+
let final_chunk = owned_chunk.as_deref().unwrap_or(&buffer);
663+
664+
supports_fast_decode_and_encode.decode_into_vec(final_chunk, &mut decoded_buffer)?;
626665
write_to_output(&mut decoded_buffer, output)?;
666+
667+
if had_invalid_tail {
668+
return Err(USimpleError::new(1, "error: invalid input".to_owned()));
669+
}
627670
}
628671

629672
Ok(())

src/uucore/src/lib/features/encoding.rs

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,11 @@ impl EncodingWrapper {
214214
}
215215
}
216216

217+
/// Outcome of padding a trailing, incomplete chunk of encoded input before the final decode
/// attempt (see `SupportsFastDecodeAndEncode::pad_remainder`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct PadResult {
    /// The chunk to decode in place of the raw remainder.
    pub chunk: Vec<u8>,
    /// `true` when part of the remainder had to be discarded to build `chunk`; the decoder
    /// still emits the bytes decoded from `chunk` but then reports the input as invalid.
    pub had_invalid_tail: bool,
}
221+
217222
pub trait SupportsFastDecodeAndEncode {
218223
/// Returns the list of characters used by this encoding
219224
fn alphabet(&self) -> &'static [u8];
@@ -245,6 +250,19 @@ pub trait SupportsFastDecodeAndEncode {
245250
///
246251
/// The decoding performed by `fast_decode` depends on this number being correct.
247252
fn valid_decoding_multiple(&self) -> usize;
253+
254+
/// Whether the decoder can flush partial chunks (multiples of `valid_decoding_multiple`)
255+
/// before seeing the full input. Defaults to `false` for encodings that must consume the
256+
/// entire input (e.g. base58).
257+
fn supports_partial_decode(&self) -> bool {
258+
false
259+
}
260+
261+
/// Gives encoding-specific logic a chance to pad a trailing, non-empty remainder
262+
/// before the final decode attempt. The default implementation opts out.
263+
fn pad_remainder(&self, _remainder: &[u8]) -> Option<PadResult> {
264+
None
265+
}
248266
}
249267

250268
impl SupportsFastDecodeAndEncode for Base58Wrapper {
@@ -504,3 +522,80 @@ impl SupportsFastDecodeAndEncode for EncodingWrapper {
504522
self.unpadded_multiple
505523
}
506524
}
525+
526+
pub struct Base32Wrapper {
527+
inner: EncodingWrapper,
528+
}
529+
530+
impl Base32Wrapper {
531+
pub fn new(
532+
encoding: Encoding,
533+
valid_decoding_multiple: usize,
534+
unpadded_multiple: usize,
535+
alphabet: &'static [u8],
536+
) -> Self {
537+
Self {
538+
inner: EncodingWrapper::new(
539+
encoding,
540+
valid_decoding_multiple,
541+
unpadded_multiple,
542+
alphabet,
543+
),
544+
}
545+
}
546+
}
547+
548+
impl SupportsFastDecodeAndEncode for Base32Wrapper {
549+
fn alphabet(&self) -> &'static [u8] {
550+
self.inner.alphabet()
551+
}
552+
553+
fn decode_into_vec(&self, input: &[u8], output: &mut Vec<u8>) -> UResult<()> {
554+
self.inner.decode_into_vec(input, output)
555+
}
556+
557+
fn encode_to_vec_deque(&self, input: &[u8], output: &mut VecDeque<u8>) -> UResult<()> {
558+
self.inner.encode_to_vec_deque(input, output)
559+
}
560+
561+
fn unpadded_multiple(&self) -> usize {
562+
self.inner.unpadded_multiple()
563+
}
564+
565+
fn valid_decoding_multiple(&self) -> usize {
566+
self.inner.valid_decoding_multiple()
567+
}
568+
569+
fn pad_remainder(&self, remainder: &[u8]) -> Option<PadResult> {
570+
if remainder.is_empty() || remainder.contains(&b'=') {
571+
return None;
572+
}
573+
574+
const VALID_REMAINDERS: [usize; 4] = [2, 4, 5, 7];
575+
576+
let mut len = remainder.len();
577+
let mut trimmed = false;
578+
579+
while len > 0 && !VALID_REMAINDERS.contains(&len) {
580+
len -= 1;
581+
trimmed = true;
582+
}
583+
584+
if len == 0 {
585+
return None;
586+
}
587+
588+
let mut padded = remainder[..len].to_vec();
589+
let missing = self.valid_decoding_multiple() - padded.len();
590+
padded.extend(std::iter::repeat_n(b'=', missing));
591+
592+
Some(PadResult {
593+
chunk: padded,
594+
had_invalid_tail: trimmed,
595+
})
596+
}
597+
598+
fn supports_partial_decode(&self) -> bool {
599+
true
600+
}
601+
}

0 commit comments

Comments
 (0)