Commit ff86d68
Replace invalid utf-8 sequences with the replacement character
Before, when parsing message content, if we came across an invalid utf-8 sequence, we'd accumulate it in `undecoded_tokens` forever, and any subsequent content would get dropped when we eventually found the next stop token. Now we detect invalid utf-8 sequences, replace each with the utf-8 replacement character '\uFFFD', and continue parsing further content.

In real-world scenarios, gpt-oss models sometimes generate invalid utf-8 sequences. This can be caused by temperature settings that are too high, by prompts that use utf-8 characters in unexpected ways outside the training data, or by some combination of the two.

The net effect is that parsing keeps making forward progress after hitting an invalid utf-8 sequence. That matters when inference servers stream long message contents and users expect tokens to be streamed back as they're generated rather than buffered for long stretches inside our `StreamableParser`. See vllm-project/vllm#26480 for one such real-world scenario encountered in vLLM.
1 parent: 508cbaa
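For intuition, here is a minimal, self-contained sketch of the decode-or-replace behavior described above, using only `std`. The byte buffer `pending` and the function name `next_content_delta` are illustrative assumptions; the real parser buffers token ids in `undecoded_tokens` and decodes them through its encoding:

```rust
// Sketch only: `pending` stands in for the bytes behind `undecoded_tokens`.
fn next_content_delta(pending: &mut Vec<u8>) -> Option<String> {
    let delta = match std::str::from_utf8(pending.as_slice()) {
        // Everything buffered decodes cleanly: flush it as one delta.
        Ok(s) => Some(s.to_string()),
        Err(e) => match e.error_len() {
            // Invalid sequence: substitute U+FFFD so parsing keeps making
            // forward progress instead of buffering forever.
            Some(_) => Some('\u{FFFD}'.to_string()),
            // Incomplete sequence: wait for the next byte.
            None => None,
        },
    };
    if delta.is_some() {
        pending.clear();
    }
    delta
}

fn main() {
    let mut pending = vec![0xE2, 0x82]; // truncated '€' (0xE2 0x82 0xAC)
    assert_eq!(next_content_delta(&mut pending), None); // keep waiting
    pending.push(0xAC); // the final byte arrives
    assert_eq!(next_content_delta(&mut pending), Some("€".to_string()));
}
```

Replacing the whole pending run with a single U+FFFD (rather than one per byte) mirrors the commit's choice: the goal is forward progress while streaming, not byte-exact lossy decoding.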

3 files changed: +35 −2 lines changed

src/encoding.rs

Lines changed: 16 additions & 2 deletions
```diff
@@ -1155,8 +1155,22 @@ impl StreamableParser {
                     self.last_content_delta = Some(decoded);
                     self.undecoded_tokens.clear();
                 }
-                Err(_) => {
-                    self.last_content_delta = None;
+                Err(e) => {
+                    match e.error_len {
+                        // We're trying to decode a sequence of tokens that is not valid utf-8.
+                        // Replace the invalid sequence with a single utf-8 replacement character.
+                        Some(_) => {
+                            let replacement = '\u{FFFD}'.to_string();
+                            self.encoding
+                                .render_text_into(&replacement, content_tokens)?;
+                            self.last_content_delta = Some(replacement);
+                            self.undecoded_tokens.clear();
+                        }
+                        None => {
+                            // waiting on next byte in our utf-8 sequence
+                            self.last_content_delta = None;
+                        }
+                    }
                 }
             }
             // this was not an EOS
```
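The `Some(_)`/`None` match above leans on the semantics of `std`'s `Utf8Error::error_len()`: `Some(_)` marks bytes that can never begin a valid sequence, while `None` means the input merely ended mid-sequence. A quick demonstration with `std::str::from_utf8` (byte values chosen for illustration):

```rust
fn main() {
    // 0xE2 0x82 is a truncated prefix of '€' (0xE2 0x82 0xAC): more bytes
    // may still arrive, so error_len() is None and the parser keeps waiting.
    let truncated = std::str::from_utf8(&[0xE2, 0x82]).unwrap_err();
    assert_eq!(truncated.error_len(), None);

    // 0xFF can never occur in valid utf-8, so error_len() is Some(1) and
    // the parser emits U+FFFD and clears its buffer.
    let invalid = std::str::from_utf8(&[0xFF, b'a']).unwrap_err();
    assert_eq!(invalid.error_len(), Some(1));
}
```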

src/tiktoken.rs

Lines changed: 3 additions & 0 deletions
```diff
@@ -148,6 +148,7 @@ impl std::error::Error for DecodeKeyError {}
 #[derive(Debug, Clone)]
 pub struct DecodeError {
     pub message: String,
+    pub error_len: Option<usize>,
 }
 
 impl std::fmt::Display for DecodeError {
@@ -212,9 +213,11 @@ impl CoreBPE {
     {
         let bytes = self.decode_bytes(tokens).map_err(|e| DecodeError {
            message: format!("Invalid token error: {e}"),
+            error_len: None,
        })?;
        String::from_utf8(bytes).map_err(|e| DecodeError {
            message: format!("Invalid utf-8 sequence: {e}"),
+            error_len: e.utf8_error().error_len(),
        })
    }
```
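The new `error_len` field is populated straight from `std`: `String::from_utf8` fails with a `FromUtf8Error`, and `utf8_error().error_len()` yields the `Option<usize>` that `encoding.rs` matches on. A small illustration of that chain (byte values are arbitrary examples):

```rust
fn main() {
    // A truncated 4-byte sequence (e.g. the start of an emoji) could still
    // be completed by later bytes, so error_len() is None.
    let err = String::from_utf8(vec![0xF0, 0x9F]).unwrap_err();
    assert_eq!(err.utf8_error().error_len(), None);

    // A lone continuation byte can never become valid: error_len() is Some(1).
    let err = String::from_utf8(vec![0x80]).unwrap_err();
    assert_eq!(err.utf8_error().error_len(), Some(1));
}
```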

tests/test_harmony.py

Lines changed: 16 additions & 0 deletions
```diff
@@ -981,3 +981,19 @@ def test_streamable_parser_tool_call_with_constrain_adjacent():
     ]
 
     assert parser.messages == expected
+
+
+def test_streamable_parser_invalid_utf8_decoding():
+    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
+
+    # The sequence of two "9552" tokens in a row is not valid utf-8.
+    # Each pair of those will give us one utf-8 replacement character.
+    tokens = [200006, 173781, 200008, 9552, 9552, 9552, 9552, 135596, 200007]
+    parser = StreamableParser(encoding, None)
+    for token in tokens:
+        parser.process(token)
+
+    expected = [
+        Message.from_role_and_content(Role.ASSISTANT, "\uFFFD\uFFFDworked"),
+    ]
+    assert parser.messages == expected
```
