@@ -5,6 +5,8 @@ use html5gum::{DefaultEmitter, Token, Tokenizer};
55/// This fuzzer checks that:
66/// 1. Spans have valid bounds (start <= end <= input.len())
77/// 2. Spans point to correct content in the input
8+ /// 3. Spans are non-overlapping and ordered
9+ /// 4. Spans are non-empty for structural tokens (tags, comments, doctypes)
810///
911/// This would have caught the bug fixed in commit 505de5b where end tag positions
1012/// were incorrectly tracked in naive state switching mode.
@@ -16,24 +18,34 @@ pub fn validate_span_invariants(input: &[u8]) {
1618
1719 let tokenizer = Tokenizer :: new_with_emitter ( input, emitter) ;
1820
21+ let mut last_end: Option < usize > = None ;
22+
1923 for result in tokenizer {
2024 let token = match result {
2125 Ok ( token) => token,
2226 Err ( _) => continue , // Errors are expected, we're fuzzing
2327 } ;
2428
25- validate_token_span ( & token, input) ;
29+ validate_token_span ( & token, input, & mut last_end ) ;
2630 }
2731}
2832
2933/// Validates the span of a single token against the input.
30- fn validate_token_span ( token : & Token < usize > , input : & [ u8 ] ) {
34+ fn validate_token_span ( token : & Token < usize > , input : & [ u8 ] , last_end : & mut Option < usize > ) {
3135 match token {
3236 Token :: StartTag ( tag) => {
33- validate_span ( & tag. span , input, "StartTag" ) ;
37+ validate_span ( & tag. span , input, "StartTag" , last_end) ;
38+
39+ // Start tags must have non-empty spans
40+ assert ! (
41+ tag. span. start < tag. span. end,
42+ "StartTag has empty span: {}..{}" ,
43+ tag. span. start,
44+ tag. span. end
45+ ) ;
3446
3547 // Verify the span actually contains the tag
36- if tag. span . start < tag . span . end && tag . span . end <= input. len ( ) {
48+ if tag. span . end <= input. len ( ) {
3749 let content = & input[ tag. span . start ..tag. span . end ] ;
3850 // Start tags should begin with '<' and contain the tag name
3951 assert ! (
@@ -58,26 +70,36 @@ fn validate_token_span(token: &Token<usize>, input: &[u8]) {
5870
5971 // Validate attribute value spans
6072 for ( _attr_name, attr_value) in & tag. attributes {
61- validate_span ( & attr_value. span , input, "Attribute value" ) ;
73+ validate_span ( & attr_value. span , input, "Attribute value" , & mut None ) ;
6274
6375 // Note: Attribute value spans may include the entire attribute declaration
6476 // (name="value") or just the value depending on implementation.
6577 // We just validate basic span invariants here.
6678 }
6779 }
6880 Token :: EndTag ( tag) => {
69- validate_span ( & tag. span , input, "EndTag" ) ;
81+ validate_span ( & tag. span , input, "EndTag" , last_end) ;
82+
83+ // End tags must have non-empty spans
84+ assert ! (
85+ tag. span. start < tag. span. end,
86+ "EndTag has empty span: {}..{} for tag '{}'" ,
87+ tag. span. start,
88+ tag. span. end,
89+ String :: from_utf8_lossy( & tag. name)
90+ ) ;
7091
7192 // Verify the span actually contains the end tag
72- if tag. span . start < tag . span . end && tag . span . end <= input. len ( ) {
93+ if tag. span . end <= input. len ( ) {
7394 let content = & input[ tag. span . start ..tag. span . end ] ;
7495 // End tags should start with '</'
7596 assert ! (
7697 content. starts_with( b"</" ) ,
77- "EndTag span does not start with '</': {:?} at {}..{}" ,
98+ "EndTag span does not start with '</': {:?} at {}..{} for tag '{}' " ,
7899 String :: from_utf8_lossy( content) ,
79100 tag. span. start,
80- tag. span. end
101+ tag. span. end,
102+ String :: from_utf8_lossy( & tag. name)
81103 ) ;
82104 // The tag name should appear in the content
83105 assert ! (
@@ -93,33 +115,50 @@ fn validate_token_span(token: &Token<usize>, input: &[u8]) {
93115 }
94116 }
95117 Token :: String ( s) => {
96- validate_span ( & s. span , input, "String" ) ;
118+ validate_span ( & s. span , input, "String" , last_end ) ;
97119
98120 // Note: String token values may differ from raw span content due to
99121 // HTML entity decoding or character reference processing.
100122 // The key invariant is that the span points to valid input bounds.
123+ // Strings can have empty spans (e.g., empty text nodes).
101124 }
102125 Token :: Comment ( c) => {
103- validate_span ( & c. span , input, "Comment" ) ;
126+ validate_span ( & c. span , input, "Comment" , last_end) ;
127+
128+ // Comments must have non-empty spans
129+ assert ! (
130+ c. span. start < c. span. end,
131+ "Comment has empty span: {}..{}" ,
132+ c. span. start,
133+ c. span. end
134+ ) ;
104135
105136 // Verify comment span contains the comment markers and content
106- if c. span . start < c . span . end && c . span . end <= input. len ( ) {
137+ if c. span . end <= input. len ( ) {
107138 let content = & input[ c. span . start ..c. span . end ] ;
108- // Comments should start with '<!--'
139+ // Comments should start with '<!' (covers both '<!--' comments and bogus comments)
109140 assert ! (
110- content. starts_with( b"<!-- " ) ,
111- "Comment span does not start with '<!-- ': {:?} at {}..{}" ,
141+ content. starts_with( b"<!" ) ,
142+ "Comment span does not start with '<!': {:?} at {}..{}" ,
112143 String :: from_utf8_lossy( content) ,
113144 c. span. start,
114145 c. span. end
115146 ) ;
116147 }
117148 }
118149 Token :: Doctype ( d) => {
119- validate_span ( & d. span , input, "Doctype" ) ;
150+ validate_span ( & d. span , input, "Doctype" , last_end) ;
151+
152+ // Doctypes must have non-empty spans
153+ assert ! (
154+ d. span. start < d. span. end,
155+ "Doctype has empty span: {}..{}" ,
156+ d. span. start,
157+ d. span. end
158+ ) ;
120159
121160 // Verify doctype span starts with '<!' (the assertion below only requires the '<!' prefix, not the full '<!DOCTYPE' keyword)
122- if d. span . start < d . span . end && d . span . end <= input. len ( ) {
161+ if d. span . end <= input. len ( ) {
123162 let content = & input[ d. span . start ..d. span . end ] ;
124163 assert ! (
125164 content. starts_with( b"<!" ) || content. starts_with( b"<!DOCTYPE" ) ,
@@ -131,13 +170,19 @@ fn validate_token_span(token: &Token<usize>, input: &[u8]) {
131170 }
132171 }
133172 Token :: Error ( e) => {
134- validate_span ( & e. span , input, "Error" ) ;
173+ validate_span ( & e. span , input, "Error" , last_end) ;
174+ // Errors can have empty spans (they may point to a position rather than a range)
135175 }
136176 }
137177}
138178
139179/// Validates basic span invariants.
140- fn validate_span ( span : & html5gum:: Span < usize > , input : & [ u8 ] , token_type : & str ) {
180+ fn validate_span (
181+ span : & html5gum:: Span < usize > ,
182+ input : & [ u8 ] ,
183+ token_type : & str ,
184+ last_end : & mut Option < usize > ,
185+ ) {
141186 // Invariant 1: start <= end
142187 assert ! (
143188 span. start <= span. end,
@@ -156,4 +201,23 @@ fn validate_span(span: &html5gum::Span<usize>, input: &[u8], token_type: &str) {
156201 span. end,
157202 input. len( )
158203 ) ;
204+
205+ // Invariant 3: Non-empty spans should be ordered and non-overlapping (each starts at or after the end of the previous non-empty span)
206+ // However, error tokens can be interleaved and may have empty spans pointing to
207+ // positions within other tokens, so we only enforce ordering for non-empty spans
208+ if span. start < span. end {
209+ // Only check ordering for non-empty spans
210+ if let Some ( prev_end) = last_end {
211+ assert ! (
212+ span. start >= * prev_end,
213+ "{} span starts before previous span ended: current {}..{}, previous ended at {}" ,
214+ token_type,
215+ span. start,
216+ span. end,
217+ prev_end
218+ ) ;
219+ }
220+ // Update last_end only for non-empty spans
221+ * last_end = Some ( span. end ) ;
222+ }
159223}
0 commit comments