     SystemContent,
     ToolDescription,
     load_harmony_encoding,
+    load_harmony_encoding_from_file,
 )
 from pydantic import ValidationError
 
@@ -949,3 +950,67 @@ def test_streamable_parser_tool_call_with_constrain_adjacent():
     ]
 
     assert parser.messages == expected
+
+
+def test_load_harmony_encoding_from_file(tmp_path):
+    import os
+    from openai_harmony import load_harmony_encoding_from_file
+
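+    # Reuse the vocab file that tiktoken-rs caches locally, keyed by the SHA-1
+    # of the download URL; skip the test if it has not been fetched yet.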
+    cache_dir = os.environ.get("TIKTOKEN_RS_CACHE_DIR")
+    if not cache_dir:
+        cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "tiktoken-rs-cache")
+    import hashlib
+    url = "https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken"
+    cache_key = hashlib.sha1(url.encode()).hexdigest()
+    vocab_file = os.path.join(cache_dir, cache_key)
+    if not os.path.exists(vocab_file):
+        import pytest
+        pytest.skip("No local vocab file available for offline test")
+
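+    # Special tokens (token string, token id) expected by the Harmony encoding.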
+    special_tokens = [
+        ("<|startoftext|>", 199998),
+        ("<|endoftext|>", 199999),
+        ("<|reserved_200000|>", 200000),
+        ("<|reserved_200001|>", 200001),
+        ("<|return|>", 200002),
+        ("<|constrain|>", 200003),
+        ("<|reserved_200004|>", 200004),
+        ("<|channel|>", 200005),
+        ("<|start|>", 200006),
+        ("<|end|>", 200007),
+        ("<|message|>", 200008),
+        ("<|reserved_200009|>", 200009),
+        ("<|reserved_200010|>", 200010),
+        ("<|reserved_200011|>", 200011),
+        ("<|call|>", 200012),
+        ("<|reserved_200013|>", 200013),
+    ]
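+    # BPE split regex used by the o200k_base tokenizer.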
+    pattern = "|".join([
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?",
+        "\\p{N}{1,3}",
+        " ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*",
+        "\\s*[\\r\\n]+",
+        "\\s+(?!\\S)",
+        "\\s+",
+    ])
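+    # Context/message limits and the expected hash used to verify the vocab file.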
+    n_ctx = 8192
+    max_message_tokens = 4096
+    max_action_length = 256
+    expected_hash = "446a9538cb6c348e3516120d7c08b09f57c36495e2acfffe59a5bf8b0cfb1a2d"
+
+    encoding = load_harmony_encoding_from_file(
+        name="test_local",
+        vocab_file=vocab_file,
+        special_tokens=special_tokens,
+        pattern=pattern,
+        n_ctx=n_ctx,
+        max_message_tokens=max_message_tokens,
+        max_action_length=max_action_length,
+        expected_hash=expected_hash,
+    )
+
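+    # Round-trip a short string as a basic sanity check.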
+    text = "Hello world!"
+    tokens = encoding.encode(text)
+    decoded = encoding.decode(tokens)
+    assert decoded.startswith("Hello world")