Skip to content

Commit 2111256

Browse files
committed
add tokenize_with_spans example, add more invariants, see #127
1 parent 982799a commit 2111256

File tree

4 files changed

+101
-19
lines changed

4 files changed

+101
-19
lines changed

.github/workflows/fuzz.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,10 @@ jobs:
2323
- name: Build AFL target
2424
working-directory: ./fuzz
2525
run: make setup-afl
26+
27+
- name: Build libfuzzer target
28+
working-directory: ./fuzz
29+
run: |
30+
rustup toolchain install nightly
31+
cargo install cargo-fuzz
32+
cargo +nightly fuzz build main_libfuzzer

examples/tokenize_with_spans.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
//! Lets you easily try out the tokenizer with e.g.
2+
//! printf '<h1>Hello world!</h1>' | cargo run --example=tokenize_with_spans
3+
use html5gum::{IoReader, Tokenizer, DefaultEmitter};
4+
5+
fn main() {
6+
let emitter = DefaultEmitter::<usize>::new_with_span();
7+
for token in Tokenizer::new_with_emitter(IoReader::new(std::io::stdin().lock()), emitter).flatten() {
8+
println!("{:?}", token);
9+
}
10+
}

fuzz/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ stdout.
6060
* Run `FUZZ_BASIC=1 make -e afl-next` after fuzzing to get the next crash and
6161
run afl-tmin on it. It will print the testcase as JSON string to check back
6262
into e.g. a file in `tests/custom-html5lib-tests/`.
63+
* Run `FUZZ_BASIC=1 make -e afl-skip` to skip over one fuzzing result.
6364

6465
## cargo fuzz
6566

fuzz/src/testcase/span_invariants.rs

Lines changed: 83 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@ use html5gum::{DefaultEmitter, Token, Tokenizer};
55
/// This fuzzer checks that:
66
/// 1. Spans have valid bounds (start <= end <= input.len())
77
/// 2. Spans point to correct content in the input
8+
/// 3. Spans are non-overlapping and ordered
9+
/// 4. Spans are non-empty for structural tokens (tags, comments, doctypes)
810
///
911
/// This would have caught the bug fixed in commit 505de5b where end tag positions
1012
/// were incorrectly tracked in naive state switching mode.
@@ -16,24 +18,34 @@ pub fn validate_span_invariants(input: &[u8]) {
1618

1719
let tokenizer = Tokenizer::new_with_emitter(input, emitter);
1820

21+
let mut last_end: Option<usize> = None;
22+
1923
for result in tokenizer {
2024
let token = match result {
2125
Ok(token) => token,
2226
Err(_) => continue, // Errors are expected, we're fuzzing
2327
};
2428

25-
validate_token_span(&token, input);
29+
validate_token_span(&token, input, &mut last_end);
2630
}
2731
}
2832

2933
/// Validates the span of a single token against the input.
30-
fn validate_token_span(token: &Token<usize>, input: &[u8]) {
34+
fn validate_token_span(token: &Token<usize>, input: &[u8], last_end: &mut Option<usize>) {
3135
match token {
3236
Token::StartTag(tag) => {
33-
validate_span(&tag.span, input, "StartTag");
37+
validate_span(&tag.span, input, "StartTag", last_end);
38+
39+
// Start tags must have non-empty spans
40+
assert!(
41+
tag.span.start < tag.span.end,
42+
"StartTag has empty span: {}..{}",
43+
tag.span.start,
44+
tag.span.end
45+
);
3446

3547
// Verify the span actually contains the tag
36-
if tag.span.start < tag.span.end && tag.span.end <= input.len() {
48+
if tag.span.end <= input.len() {
3749
let content = &input[tag.span.start..tag.span.end];
3850
// Start tags should begin with '<' and contain the tag name
3951
assert!(
@@ -58,26 +70,36 @@ fn validate_token_span(token: &Token<usize>, input: &[u8]) {
5870

5971
// Validate attribute value spans
6072
for (_attr_name, attr_value) in &tag.attributes {
61-
validate_span(&attr_value.span, input, "Attribute value");
73+
validate_span(&attr_value.span, input, "Attribute value", &mut None);
6274

6375
// Note: Attribute value spans may include the entire attribute declaration
6476
// (name="value") or just the value depending on implementation.
6577
// We just validate basic span invariants here.
6678
}
6779
}
6880
Token::EndTag(tag) => {
69-
validate_span(&tag.span, input, "EndTag");
81+
validate_span(&tag.span, input, "EndTag", last_end);
82+
83+
// End tags must have non-empty spans
84+
assert!(
85+
tag.span.start < tag.span.end,
86+
"EndTag has empty span: {}..{} for tag '{}'",
87+
tag.span.start,
88+
tag.span.end,
89+
String::from_utf8_lossy(&tag.name)
90+
);
7091

7192
// Verify the span actually contains the end tag
72-
if tag.span.start < tag.span.end && tag.span.end <= input.len() {
93+
if tag.span.end <= input.len() {
7394
let content = &input[tag.span.start..tag.span.end];
7495
// End tags should start with '</'
7596
assert!(
7697
content.starts_with(b"</"),
77-
"EndTag span does not start with '</': {:?} at {}..{}",
98+
"EndTag span does not start with '</': {:?} at {}..{} for tag '{}'",
7899
String::from_utf8_lossy(content),
79100
tag.span.start,
80-
tag.span.end
101+
tag.span.end,
102+
String::from_utf8_lossy(&tag.name)
81103
);
82104
// The tag name should appear in the content
83105
assert!(
@@ -93,33 +115,50 @@ fn validate_token_span(token: &Token<usize>, input: &[u8]) {
93115
}
94116
}
95117
Token::String(s) => {
96-
validate_span(&s.span, input, "String");
118+
validate_span(&s.span, input, "String", last_end);
97119

98120
// Note: String token values may differ from raw span content due to
99121
// HTML entity decoding or character reference processing.
100122
// The key invariant is that the span points to valid input bounds.
123+
// Strings can have empty spans (e.g., empty text nodes).
101124
}
102125
Token::Comment(c) => {
103-
validate_span(&c.span, input, "Comment");
126+
validate_span(&c.span, input, "Comment", last_end);
127+
128+
// Comments must have non-empty spans
129+
assert!(
130+
c.span.start < c.span.end,
131+
"Comment has empty span: {}..{}",
132+
c.span.start,
133+
c.span.end
134+
);
104135

105136
// Verify comment span contains the comment markers and content
106-
if c.span.start < c.span.end && c.span.end <= input.len() {
137+
if c.span.end <= input.len() {
107138
let content = &input[c.span.start..c.span.end];
108-
// Comments should start with '<!--'
139+
// Comments should start with '<!' (covers both '<!--' and bogus comments)
109140
assert!(
110-
content.starts_with(b"<!--"),
111-
"Comment span does not start with '<!--': {:?} at {}..{}",
141+
content.starts_with(b"<!"),
142+
"Comment span does not start with '<!': {:?} at {}..{}",
112143
String::from_utf8_lossy(content),
113144
c.span.start,
114145
c.span.end
115146
);
116147
}
117148
}
118149
Token::Doctype(d) => {
119-
validate_span(&d.span, input, "Doctype");
150+
validate_span(&d.span, input, "Doctype", last_end);
151+
152+
// Doctypes must have non-empty spans
153+
assert!(
154+
d.span.start < d.span.end,
155+
"Doctype has empty span: {}..{}",
156+
d.span.start,
157+
d.span.end
158+
);
120159

121160
// Verify doctype span starts with '<!' (the assertion below accepts any '<!' prefix, which includes '<!DOCTYPE')
122-
if d.span.start < d.span.end && d.span.end <= input.len() {
161+
if d.span.end <= input.len() {
123162
let content = &input[d.span.start..d.span.end];
124163
assert!(
125164
content.starts_with(b"<!") || content.starts_with(b"<!DOCTYPE"),
@@ -131,13 +170,19 @@ fn validate_token_span(token: &Token<usize>, input: &[u8]) {
131170
}
132171
}
133172
Token::Error(e) => {
134-
validate_span(&e.span, input, "Error");
173+
validate_span(&e.span, input, "Error", last_end);
174+
// Errors can have empty spans (they may point to a position rather than a range)
135175
}
136176
}
137177
}
138178

139179
/// Validates basic span invariants.
140-
fn validate_span(span: &html5gum::Span<usize>, input: &[u8], token_type: &str) {
180+
fn validate_span(
181+
span: &html5gum::Span<usize>,
182+
input: &[u8],
183+
token_type: &str,
184+
last_end: &mut Option<usize>,
185+
) {
141186
// Invariant 1: start <= end
142187
assert!(
143188
span.start <= span.end,
@@ -156,4 +201,23 @@ fn validate_span(span: &html5gum::Span<usize>, input: &[u8], token_type: &str) {
156201
span.end,
157202
input.len()
158203
);
204+
205+
// Invariant 3: Spans should be ordered and non-overlapping (each non-empty span starts at or after the end of the previous non-empty span)
206+
// However, error tokens can be interleaved and may have empty spans pointing to
207+
// positions within other tokens, so we only enforce ordering for non-empty spans
208+
if span.start < span.end {
209+
// Only check ordering for non-empty spans
210+
if let Some(prev_end) = last_end {
211+
assert!(
212+
span.start >= *prev_end,
213+
"{} span starts before previous span ended: current {}..{}, previous ended at {}",
214+
token_type,
215+
span.start,
216+
span.end,
217+
prev_end
218+
);
219+
}
220+
// Update last_end only for non-empty spans
221+
*last_end = Some(span.end);
222+
}
159223
}

0 commit comments

Comments
 (0)