from transformers import AutoTokenizer, PreTrainedTokenizerFast
from transformers.tokenization_utils_base import AddedToken
from http.server import HTTPServer, BaseHTTPRequestHandler
import json
import argparse

class Tokenizer_Http:
    def __init__(self, model_id, system_content="You are a helpful assistant."):
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True,
            use_fast=False
        )
        # Cache of token ids whose decoded text is still incomplete (see decode()).
        self.token_ids_cache = []
        self.system_content = system_content

    def encode(self, content):
        # Wrap the user content in a ChatML-style prompt and return its token ids.
        text = [
            f'<|im_start|>system\n{self.system_content}<|im_end|>\n'
            f'<|im_start|>user\n{content}<|im_end|>\n'
            f'<|im_start|>assistant\n'
        ]
        input_ids = self.tokenizer(text)
        return input_ids["input_ids"][0]

    def encode_vpm_image(self, content="Describe this image.", num_img=1, img_token_num=256):
        # Each image gets its own <|vision_start|> ... <|vision_end|> block,
        # with img_token_num <|image_pad|> placeholders per image.
        imgs_token = (
            '<|vision_start|>'
            + '<|image_pad|>' * img_token_num
            + '<|vision_end|>'
        )
        imgs_token *= num_img
        text = (
            f'<|im_start|>system\n{self.system_content}<|im_end|>\n'
            f'<|im_start|>user\n{imgs_token}{content}<|im_end|>\n'
            f'<|im_start|>assistant\n'
        )
        text_inputs = self.tokenizer([text])
        return text_inputs["input_ids"][0]

    def encode_vpm_video(self, content="Describe this image.", num_img=1, img_token_num=256):
        # All video frames share one <|vision_start|> ... <|vision_end|> block,
        # with img_token_num <|video_pad|> placeholders per frame.
        imgs_token = (
            '<|vision_start|>'
            + '<|video_pad|>' * img_token_num * num_img
            + '<|vision_end|>'
        )
        text = (
            f'<|im_start|>system\n{self.system_content}<|im_end|>\n'
            f'<|im_start|>user\n{imgs_token}{content}<|im_end|>\n'
            f'<|im_start|>assistant\n'
        )
        text_inputs = self.tokenizer([text])
        return text_inputs["input_ids"][0]

    def decode(self, token_ids):
        # Accumulate ids until they decode to valid text: a U+FFFD replacement
        # character means a multi-byte sequence is still incomplete, so keep the
        # cache and return an empty string for now.
        self.token_ids_cache += token_ids
        text = self.tokenizer.decode(self.token_ids_cache)
        if "\ufffd" in text:
            print("text contains an invalid (incomplete) character")
            return ""
        else:
            self.token_ids_cache.clear()
            return text

    @property
    def bos_id(self):
        return self.tokenizer.bos_token_id

    @property
    def eos_id(self):
        return self.tokenizer.eos_token_id

    @property
    def bos_token(self):
        return self.tokenizer.bos_token

    @property
    def eos_token(self):
        return self.tokenizer.eos_token

    @property
    def img_start_token(self):
        return self.tokenizer.encode("<|vision_start|>")[0]

    @property
    def img_context_token(self):
        return self.tokenizer.encode("<|image_pad|>")[0]

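# Direct-usage sketch of Tokenizer_Http outside the HTTP server (the model path
# below is only an illustrative assumption; any tokenizer that defines the
# <|im_start|>/<|vision_start|>/<|image_pad|> special tokens should behave the same):
#
#   tok = Tokenizer_Http("Qwen/Qwen2-VL-2B-Instruct")
#   ids = tok.encode("Hello")                                      # plain chat prompt
#   img_ids = tok.encode_vpm_image("Describe this image.", 1, 256) # image prompt
#   print(tok.decode(ids))                                         # round-trips to text
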
class Request(BaseHTTPRequestHandler):
    timeout = 5
    server_version = 'Apache'

    def do_GET(self):
        # GET endpoints expose the special-token ids to clients as JSON.
        print(self.path)
        self.send_response(200)
        self.send_header("type", "get")
        self.end_headers()
        if self.path == '/bos_id':
            bos_id = tokenizer.bos_id
            msg = json.dumps({'bos_id': -1 if bos_id is None else bos_id})
        elif self.path == '/eos_id':
            eos_id = tokenizer.eos_id
            msg = json.dumps({'eos_id': -1 if eos_id is None else eos_id})
        elif self.path == '/img_start_token':
            img_start_token = tokenizer.img_start_token
            msg = json.dumps({'img_start_token': -1 if img_start_token is None else img_start_token})
        elif self.path == '/img_context_token':
            img_context_token = tokenizer.img_context_token
            msg = json.dumps({'img_context_token': -1 if img_context_token is None else img_context_token})
        else:
            msg = 'error'
        print(msg)
        msg = str(msg).encode()
        self.wfile.write(msg)

    def do_POST(self):
        # POST endpoints: /encode turns text (optionally with image/video
        # placeholders) into token ids, /decode turns token ids back into text.
        data = self.rfile.read(int(self.headers['content-length']))
        req = json.loads(data.decode())
        if self.path == "/encode":
            prompt = req['text']
            b_img_prompt = req.get('img_prompt', False)
            img_type = req.get('img_type', 'image')
            if b_img_prompt:
                if img_type == 'image':
                    token_ids = tokenizer.encode_vpm_image(
                        prompt,
                        req.get("num_img", 1),
                        req.get("img_token_num", 256)
                    )
                elif img_type == 'video':
                    token_ids = tokenizer.encode_vpm_video(
                        prompt,
                        req.get("num_img", 1),
                        req.get("img_token_num", 256)
                    )
                else:
                    token_ids = tokenizer.encode(prompt)
            else:
                token_ids = tokenizer.encode(prompt)
            msg = json.dumps({'token_ids': -1 if token_ids is None else token_ids})
        elif self.path == "/decode":
            token_ids = req['token_ids']
            text = tokenizer.decode(token_ids)
            msg = json.dumps({'text': "" if text is None else text})
        else:
            msg = 'error'
        self.send_response(200)
        self.end_headers()
        self.wfile.write(str(msg).encode())

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', type=str, default='localhost')
    parser.add_argument('--port', type=int, default=8080)
    parser.add_argument('--model_id', type=str, default='tokenizer')
    parser.add_argument('--content', type=str, default='You are a helpful assistant.')
    args = parser.parse_args()
    tokenizer = Tokenizer_Http(args.model_id, system_content=args.content)
    host = (args.host, args.port)
    print(f"http://{args.host}:{args.port}")
    server = HTTPServer(host, Request)
    server.serve_forever()
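
# A minimal client sketch against this server (assumes it is running on
# localhost:8080 as started above; endpoint paths and JSON field names mirror
# do_GET/do_POST in this file, everything else here is illustrative):
#
#   import json
#   import urllib.request
#
#   def post(path, payload):
#       req = urllib.request.Request(
#           f"http://localhost:8080{path}",
#           data=json.dumps(payload).encode(),
#           headers={"Content-Type": "application/json"},
#       )
#       with urllib.request.urlopen(req) as resp:
#           return json.loads(resp.read())
#
#   ids = post("/encode", {"text": "Describe this image.",
#                          "img_prompt": True, "img_type": "image",
#                          "num_img": 1, "img_token_num": 256})["token_ids"]
#   print(post("/decode", {"token_ids": ids})["text"])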