@@ -67,7 +67,7 @@ def initialize_model_and_tokenizer(model_name_or_path):
 parser.add_argument(
     "--model_name_or_path", type=str, default="meta-llama/Meta-Llama-3.1-8B-Instruct", help="model name or path"
 )
-parser.add_argument("--dtype", type=str, default="mx_fp4", choices=["mx_fp4", "mx_fp8", "nv_fp4", "fp4_v2"], help="data type")
+parser.add_argument("--dtype", type=str, default="MXFP4", choices=["MXFP4", "MXFP8", "NVFP4", "NVFP4+", "uNVFP4"], help="data type")
 parser.add_argument("--quantize", action="store_true", help="whether to quantize model")
 parser.add_argument("--use_recipe", action="store_true", help="whether to use recipe to quantize model")
 parser.add_argument("--recipe_file", type=str, default="recipes/Meta-Llama-3.1-8B-Instruct_6bits.json", help="path of recipe file")
@@ -80,13 +80,6 @@ def initialize_model_and_tokenizer(model_name_or_path):
 parser.add_argument("--accuracy", action="store_true", help="accuracy measurement")
 parser.add_argument("--local_rank", type=int, default=0, metavar="N", help="Local process rank.")
 parser.add_argument("--batch_size", default=32, type=int, help="batch size for accuracy evaluation.")
-parser.add_argument(
-    "--mxfp8_mod_list",
-    type=str,
-    nargs="*",
-    default=[],  # default value
-    help="List of module names or patterns for MXFP8 quantization.",
-)
 parser.add_argument(
     "--tasks",
     type=str,
@@ -109,6 +102,14 @@ def initialize_model_and_tokenizer(model_name_or_path):
 device = "hpu" if is_hpex_available() else "cuda"

 if args.quantize:
+    autoround_dtype_mapping = {
+        "MXFP4": "mx_fp4",
+        "MXFP8": "mx_fp8",
+        "NVFP4": "nv_fp4",
+        "uNVFP4": "fp4_v2",
+        "NVFP4+": "fp4_v2",
+    }
+    args.dtype = autoround_dtype_mapping[args.dtype]
     if args.quant_lm_head:
         lm_head_config = {
             "group_size": 32 if "mx" in args.dtype else 16,
@@ -155,11 +156,10 @@ def load_recipe_results(file_path):
     autoround.quantize()
     model = autoround.model

-# set dtype to BF16 for HPU inference performance
-model = model.to(torch.bfloat16)
-model = model.eval().to(device)
-
 if args.accuracy:
+    # set dtype to BF16 for HPU inference performance
+    model = model.to(torch.bfloat16)
+    model = model.eval().to(device)
     if is_hpex_available():
         # HPU needs padding to buckets for better performance
         # Generation tasks, such as gsm8k and mmlu-pro, may get OOM.
@@ -240,3 +240,12 @@ def load_recipe_results(file_path):
     for task_name, accu in all_accuracy.items():
         print(f"Accuracy for {task_name}: {accu:.4f}")
     print(f"Overall accuracy: {sum(all_accuracy.values())/len(all_accuracy):.4f}")
+
+if args.save:
+    if world_size > 1:
+        assert False, "model quantized with deepspeed tensor parallel is not supported to be saved."
+    elif args.use_recipe:
+        assert False, "model quantized with recipe is not supported to be saved."
+    else:
+        autoround.save_pretrained(args.save_path, format="llm_compressor")
+        print(f"Quantized model is saved to {args.save_path}")
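A minimal standalone sketch (illustrative only, not part of the commit) of how the new user-facing dtype names resolve to AutoRound's internal strings and how the lm-head group size is derived from them, using the mapping and group-size rule shown in the hunks above:

    # Illustrative sketch; mapping and group-size rule copied from the diff above.
    autoround_dtype_mapping = {
        "MXFP4": "mx_fp4",
        "MXFP8": "mx_fp8",
        "NVFP4": "nv_fp4",
        "uNVFP4": "fp4_v2",
        "NVFP4+": "fp4_v2",  # NVFP4+ and uNVFP4 share the same internal dtype
    }
    dtype = autoround_dtype_mapping["NVFP4+"]   # -> "fp4_v2"
    group_size = 32 if "mx" in dtype else 16    # MX formats use 32-element groups, NV formats 16
    print(dtype, group_size)                    # fp4_v2 16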