@@ -67,7 +67,7 @@ def initialize_model_and_tokenizer(model_name_or_path):
 parser.add_argument(
     "--model_name_or_path", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct", help="model name or path"
 )
-parser.add_argument("--dtype", type=str, default="mx_fp4", choices=["mx_fp4", "mx_fp8", "nv_fp4", "fp4_v2"], help="data type")
+parser.add_argument("--dtype", type=str, default="MXFP4", choices=["MXFP4", "MXFP8", "NVFP4", "NVFP4+", "uNVFP4"], help="data type")
 parser.add_argument("--quantize", action="store_true", help="whether to quantize model")
 parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
 parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
@@ -80,13 +80,6 @@ def initialize_model_and_tokenizer(model_name_or_path):
 parser.add_argument("--accuracy", action="store_true", help="accuracy measurement")
 parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.")
 parser.add_argument("--batch_size", default=32, type=int, help="batch size for accuracy evaluation.")
-parser.add_argument(
-    "--mxfp8_mod_list",
-    type=str,
-    nargs="*",
-    default=[],  # default value
-    help="List of module names or patterns for MXFP8 quantization.",
-)
 parser.add_argument(
     "--tasks",
     type=str,
@@ -109,6 +102,14 @@ def initialize_model_and_tokenizer(model_name_or_path):
 device = "hpu" if is_hpex_available() else "cuda"

 if args.quantize:
+    autoround_dtype_mapping = {
+        "MXFP4": "mx_fp4",
+        "MXFP8": "mx_fp8",
+        "NVFP4": "nv_fp4",
+        "uNVFP4": "fp4_v2",
+        "NVFP4+": "fp4_v2",
+    }
+    args.dtype = autoround_dtype_mapping[args.dtype]
     if args.quant_lm_head:
         lm_head_config = {
             "group_size": 32 if "mx" in args.dtype else 16,
@@ -155,11 +156,10 @@ def load_recipe_results(file_path):
     autoround.quantize()
     model = autoround.model

-# set dtype to BF16 for HPU inference performance
-model = model.to(torch.bfloat16)
-model = model.eval().to(device)
-
 if args.accuracy:
+    # set dtype to BF16 for HPU inference performance
+    model = model.to(torch.bfloat16)
+    model = model.eval().to(device)
     if is_hpex_available():
         # HPU needs padding to buckets for better performance
         # Generation tasks, such as gsm8k and mmlu-pro, may get OOM.
@@ -240,3 +240,12 @@ def load_recipe_results(file_path):
     for task_name, accu in all_accuracy.items():
         print(f"Accuracy for {task_name}: {accu:.4f}")
     print(f"Overall accuracy: {sum(all_accuracy.values())/len(all_accuracy):.4f}")
+
+if args.save:
+    if world_size > 1:
+        assert False, "model quantized with deepspeed tensor parallel is not supported to be saved."
+    elif args.use_recipe:
+        assert False, "model quantized with recipe is not supported to be saved."
+    else:
+        autoround.save_pretrained(args.save_path, format="llm_compressor")
+        print(f"Quantized model is saved to {args.save_path}")
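A minimal standalone sketch (illustrative only, not part of the commit) of how the new user-facing dtype names resolve to AutoRound's internal strings and how the lm-head group size is derived from them, using the mapping and group-size rule shown in the hunks above:

    # Illustrative sketch; mapping and group-size rule copied from the diff above.
    autoround_dtype_mapping = {
        "MXFP4": "mx_fp4",
        "MXFP8": "mx_fp8",
        "NVFP4": "nv_fp4",
        "uNVFP4": "fp4_v2",
        "NVFP4+": "fp4_v2",  # NVFP4+ and uNVFP4 share the same internal dtype
    }
    dtype = autoround_dtype_mapping["NVFP4+"]   # -> "fp4_v2"
    group_size = 32 if "mx" in dtype else 16    # MX formats use 32-element groups, NV formats 16
    print(dtype, group_size)                    # fp4_v2 16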