add tokenize_with_spans example, add more invariants, see #127

untitaker · untitaker · commit 2111256fb396 · 2025-11-27T23:05:49.000+01:00
diff --git a/.github/workflows/fuzz.yml b/.github/workflows/fuzz.yml
@@ -23,3 +23,10 @@ jobs:
       - name: Build AFL target
         working-directory: ./fuzz
         run: make setup-afl
+
+      - name: Build libfuzzer target
+        working-directory: ./fuzz
+        run: |
+          rustup toolchain install nightly
+          cargo install cargo-fuzz
+          cargo +nightly fuzz build main_libfuzzer
diff --git a/examples/tokenize_with_spans.rs b/examples/tokenize_with_spans.rs
@@ -0,0 +1,10 @@
+//! Lets you easily try out the tokenizer with e.g.
+//! printf '<h1>Hello world!</h1>' | cargo run --example=tokenize_with_spans
+use html5gum::{IoReader, Tokenizer, DefaultEmitter};
+
+fn main() {
+    let emitter = DefaultEmitter::<usize>::new_with_span();
+    for token in Tokenizer::new_with_emitter(IoReader::new(std::io::stdin().lock()), emitter).flatten() {
+        println!("{:?}", token);
+    }
+}
diff --git a/fuzz/README.md b/fuzz/README.md
@@ -60,6 +60,7 @@ stdout.
 * Run `FUZZ_BASIC=1 make -e afl-next` after fuzzing to get the next crash and
   run afl-tmin on it. It will print the testcase as JSON string to check back
   into e.g. a file in `tests/custom-html5lib-tests/`.
+* Run `FUZZ_BASIC=1 make -e afl-skip` to skip over one fuzzing result.
 
 ## cargo fuzz
 
diff --git a/fuzz/src/testcase/span_invariants.rs b/fuzz/src/testcase/span_invariants.rs
@@ -5,6 +5,8 @@ use html5gum::{DefaultEmitter, Token, Tokenizer};
 /// This fuzzer checks that:
 /// 1. Spans have valid bounds (start <= end <= input.len())
 /// 2. Spans point to correct content in the input
+/// 3. Non-empty token spans are non-overlapping and ordered (attribute value spans are exempt)
+/// 4. Spans are non-empty for structural tokens (tags, comments, doctypes)
 ///
 /// This would have caught the bug fixed in commit 505de5b where end tag positions
 /// were incorrectly tracked in naive state switching mode.
@@ -16,24 +18,34 @@ pub fn validate_span_invariants(input: &[u8]) {
 
     let tokenizer = Tokenizer::new_with_emitter(input, emitter);
 
+    let mut last_end: Option<usize> = None;
+
     for result in tokenizer {
         let token = match result {
             Ok(token) => token,
             Err(_) => continue, // Errors are expected, we're fuzzing
         };
 
-        validate_token_span(&token, input);
+        validate_token_span(&token, input, &mut last_end);
     }
 }
 
 /// Validates the span of a single token against the input.
-fn validate_token_span(token: &Token<usize>, input: &[u8]) {
+fn validate_token_span(token: &Token<usize>, input: &[u8], last_end: &mut Option<usize>) {
     match token {
         Token::StartTag(tag) => {
-            validate_span(&tag.span, input, "StartTag");
+            validate_span(&tag.span, input, "StartTag", last_end);
+
+            // Start tags must have non-empty spans
+            assert!(
+                tag.span.start < tag.span.end,
+                "StartTag has empty span: {}..{}",
+                tag.span.start,
+                tag.span.end
+            );
 
             // Verify the span actually contains the tag
-            if tag.span.start < tag.span.end && tag.span.end <= input.len() {
+            if tag.span.end <= input.len() {
                 let content = &input[tag.span.start..tag.span.end];
                 // Start tags should begin with '<' and contain the tag name
                 assert!(
@@ -58,26 +70,36 @@ fn validate_token_span(token: &Token<usize>, input: &[u8]) {
 
             // Validate attribute value spans
             for (_attr_name, attr_value) in &tag.attributes {
-                validate_span(&attr_value.span, input, "Attribute value");
+                validate_span(&attr_value.span, input, "Attribute value", &mut None);
 
                 // Note: Attribute value spans may include the entire attribute declaration
                 // (name="value") or just the value depending on implementation.
                 // We just validate basic span invariants here.
             }
         }
         Token::EndTag(tag) => {
-            validate_span(&tag.span, input, "EndTag");
+            validate_span(&tag.span, input, "EndTag", last_end);
+
+            // End tags must have non-empty spans
+            assert!(
+                tag.span.start < tag.span.end,
+                "EndTag has empty span: {}..{} for tag '{}'",
+                tag.span.start,
+                tag.span.end,
+                String::from_utf8_lossy(&tag.name)
+            );
 
             // Verify the span actually contains the end tag
-            if tag.span.start < tag.span.end && tag.span.end <= input.len() {
+            if tag.span.end <= input.len() {
                 let content = &input[tag.span.start..tag.span.end];
                 // End tags should start with '</'
                 assert!(
                     content.starts_with(b"</"),
-                    "EndTag span does not start with '</': {:?} at {}..{}",
+                    "EndTag span does not start with '</': {:?} at {}..{} for tag '{}'",
                     String::from_utf8_lossy(content),
                     tag.span.start,
-                    tag.span.end
+                    tag.span.end,
+                    String::from_utf8_lossy(&tag.name)
                 );
                 // The tag name should appear in the content
                 assert!(
@@ -93,33 +115,50 @@ fn validate_token_span(token: &Token<usize>, input: &[u8]) {
             }
         }
         Token::String(s) => {
-            validate_span(&s.span, input, "String");
+            validate_span(&s.span, input, "String", last_end);
 
             // Note: String token values may differ from raw span content due to
             // HTML entity decoding or character reference processing.
             // The key invariant is that the span points to valid input bounds.
+            // Strings can have empty spans (e.g., empty text nodes).
         }
         Token::Comment(c) => {
-            validate_span(&c.span, input, "Comment");
+            validate_span(&c.span, input, "Comment", last_end);
+
+            // Comments must have non-empty spans
+            assert!(
+                c.span.start < c.span.end,
+                "Comment has empty span: {}..{}",
+                c.span.start,
+                c.span.end
+            );
 
             // Verify comment span contains the comment markers and content
-            if c.span.start < c.span.end && c.span.end <= input.len() {
+            if c.span.end <= input.len() {
                 let content = &input[c.span.start..c.span.end];
-                // Comments should start with '<!--'
+                // Comments should start with '<!' (covers both '<!--' and bogus comments)
                 assert!(
-                    content.starts_with(b"<!--"),
-                    "Comment span does not start with '<!--': {:?} at {}..{}",
+                    content.starts_with(b"<!"),
+                    "Comment span does not start with '<!': {:?} at {}..{}",
                     String::from_utf8_lossy(content),
                     c.span.start,
                     c.span.end
                 );
             }
         }
         Token::Doctype(d) => {
-            validate_span(&d.span, input, "Doctype");
+            validate_span(&d.span, input, "Doctype", last_end);
+
+            // Doctypes must have non-empty spans
+            assert!(
+                d.span.start < d.span.end,
+                "Doctype has empty span: {}..{}",
+                d.span.start,
+                d.span.end
+            );
 
             // Verify doctype span starts with '<!DOCTYPE'
-            if d.span.start < d.span.end && d.span.end <= input.len() {
+            if d.span.end <= input.len() {
                 let content = &input[d.span.start..d.span.end];
                 assert!(
                     content.starts_with(b"<!") || content.starts_with(b"<!DOCTYPE"),
@@ -131,13 +170,19 @@ fn validate_token_span(token: &Token<usize>, input: &[u8]) {
             }
         }
         Token::Error(e) => {
-            validate_span(&e.span, input, "Error");
+            validate_span(&e.span, input, "Error", last_end);
+            // Errors can have empty spans (they may point to a position rather than a range)
         }
     }
 }
 
 /// Validates basic span invariants.
-fn validate_span(span: &html5gum::Span<usize>, input: &[u8], token_type: &str) {
+fn validate_span(
+    span: &html5gum::Span<usize>,
+    input: &[u8],
+    token_type: &str,
+    last_end: &mut Option<usize>,
+) {
     // Invariant 1: start <= end
     assert!(
         span.start <= span.end,
@@ -156,4 +201,23 @@ fn validate_span(span: &html5gum::Span<usize>, input: &[u8], token_type: &str) {
         span.end,
         input.len()
     );
+
+    // Invariant 3: Spans should be ordered and non-overlapping (each start >= previous end)
+    // However, error tokens can be interleaved and may have empty spans pointing to
+    // positions within other tokens, so we only enforce ordering for non-empty spans
+    if span.start < span.end {
+        // Only check ordering for non-empty spans
+        if let Some(prev_end) = last_end {
+            assert!(
+                span.start >= *prev_end,
+                "{} span starts before previous span ended: current {}..{}, previous ended at {}",
+                token_type,
+                span.start,
+                span.end,
+                prev_end
+            );
+        }
+        // Update last_end only for non-empty spans
+        *last_end = Some(span.end);
+    }
 }