# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier
from llmcompressor.utils import dispatch_for_generation

# Select model and load it.
model_id = os.environ["MODEL_ID"]
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)
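# Note: the model id is read from the MODEL_ID environment variable, which
# must be set before running, e.g. (illustrative model id):
#   export MODEL_ID=meta-llama/Meta-Llama-3-8B-Instruct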

# Select calibration dataset.
DATASET_ID = "HuggingFaceH4/ultrachat_200k"
DATASET_SPLIT = "train_sft"

# Select number of samples. 512 samples is a good place to start.
# Increasing the number of samples can improve accuracy.
NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 2048
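
# Note: calibration cost grows with NUM_CALIBRATION_SAMPLES and
# MAX_SEQUENCE_LENGTH; lower these first if calibration runs out of memory.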

# Load dataset and preprocess.
ds = load_dataset(DATASET_ID, split=f"{DATASET_SPLIT}[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess(example):
    # Render each multi-turn conversation into a single string using the
    # model's chat template, so calibration inputs match the chat format.
    return {
        "text": tokenizer.apply_chat_template(
            example["messages"],
            tokenize=False,
        )
    }


ds = ds.map(preprocess)
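# Each example now carries a "text" column with the rendered conversation;
# the remaining raw columns are dropped during tokenization below.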


# Tokenize inputs.
def tokenize(sample):
    # add_special_tokens=False because the chat template above has already
    # inserted any special tokens the model expects.
    return tokenizer(
        sample["text"],
        padding=False,
        max_length=MAX_SEQUENCE_LENGTH,
        truncation=True,
        add_special_tokens=False,
    )


ds = ds.map(tokenize, remove_columns=ds.column_names)
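# Only tokenizer outputs (input_ids, attention_mask) remain, which is the
# pre-tokenized form that oneshot can consume directly for calibration.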

# Configure the quantization algorithm to run.
# * quantize the weights to 4 bits with GPTQ, using a group size of 128
recipe = GPTQModifier(targets="Linear", scheme="W4A16", ignore=["lm_head"])
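# "W4A16" quantizes weights to 4-bit while activations stay in 16-bit;
# targets="Linear" matches every torch.nn.Linear layer, and "lm_head" is
# excluded so the final output projection keeps full precision.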

# Apply algorithms.
oneshot(
    model=model,
    dataset=ds,
    recipe=recipe,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
)
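# oneshot runs calibration forward passes over the dataset and applies
# GPTQ layer by layer; this is one-shot post-training quantization, so
# no gradient-based fine-tuning takes place.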

# Confirm generations of the quantized model look sane.
print("\n\n")
print("========== SAMPLE GENERATION ==============")
# dispatch_for_generation places the model onto available devices for inference.
dispatch_for_generation(model)
sample = tokenizer("Hello my name is", return_tensors="pt")
sample = {key: value.to(model.device) for key, value in sample.items()}
output = model.generate(**sample, max_new_tokens=100)
print(tokenizer.decode(output[0]))
print("==========================================\n\n")
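# If the sample output looks degenerate (e.g. repetition or gibberish),
# consider increasing NUM_CALIBRATION_SAMPLES or revisiting the
# calibration dataset before saving.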

# Save the compressed model to disk.
SAVE_DIR = model_id.rstrip("/").split("/")[-1] + "-W4A16-G128"
model.save_pretrained(SAVE_DIR, save_compressed=True)
tokenizer.save_pretrained(SAVE_DIR)
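
# To serve the saved checkpoint, something like the following should work
# (a sketch; assumes a vLLM build with compressed-tensors support):
#   from vllm import LLM
#   llm = LLM(model=SAVE_DIR)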

# Optionally push to the Hugging Face Hub. This requires authentication,
# e.g. `huggingface-cli login` or the HF_TOKEN environment variable.
model.push_to_hub(SAVE_DIR)
tokenizer.push_to_hub(SAVE_DIR)