From 4bad51493be5e156fc33d77fcda452222f1944fa Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Sat, 11 Oct 2025 10:12:09 +0800
Subject: [PATCH 1/5] update MXQuant doc

Signed-off-by: Kaihui-intel
---
 docs/source/3x/PT_MXQuant.md | 36 ++++++++++++++++++++++++++++++------
 1 file changed, 30 insertions(+), 6 deletions(-)

diff --git a/docs/source/3x/PT_MXQuant.md b/docs/source/3x/PT_MXQuant.md
index 44dbd05e7d4..cae57208315 100644
--- a/docs/source/3x/PT_MXQuant.md
+++ b/docs/source/3x/PT_MXQuant.md
@@ -83,19 +83,43 @@ The exponent (exp) is equal to torch.floor(torch.log2(amax)), MAX is the represe
 
 ## Get Started with Microscaling Quantization API
 
-To get a model quantized with Microscaling Data Types, users can use the Microscaling Quantization API as follows.
+To get a model quantized with Microscaling Data Types, users can use the AutoRound Quantization API as follows.
 
 ```python
-from neural_compressor.torch.quantization import MXQuantConfig, prepare, convert
+from neural_compressor.torch.quantization import AutoRoundConfig, prepare, convert
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", device_map="auto",)
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True)
+output_dir = "./saved_inc"
+
+# quantization configuration
+quant_config = AutoRoundConfig(tokenizer=tokenizer,
+    nsamples=32,
+    seqlen=32,
+    iters=20,
+    scheme="MXFP4", # MXFP4, MXFP8
+    export_format="auto_round",
+    output_dir=output_dir, # default is "temp_auto_round"
+)
+
+# quantize the model and save to output_dir
+model = prepare(model=fp32_model, quant_config=quant_config)
+model = convert(model)
+
+# loading
+model = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype="auto", device_map="auto")
+
+# inference
+text = "There is a girl who likes adventure,"
+inputs = tokenizer(text, return_tensors="pt").to(model.device)
+print(tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]))
 
-quant_config = MXQuantConfig(w_dtype=args.w_dtype, act_dtype=args.act_dtype, weight_only=args.woq)
-user_model = prepare(model=user_model, quant_config=quant_config)
-user_model = convert(model=user_model)
 ```
 
 ## Examples
 
-- PyTorch [huggingface models](/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/mx_quant)
+- PyTorch [huggingface models](/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4)
 
 ## Reference
 

From 6d6f14b0f41cfaaf3d91f220f218ff3143c37649 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 11 Oct 2025 03:19:10 +0000
Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 docs/source/3x/PT_MXQuant.md | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/source/3x/PT_MXQuant.md b/docs/source/3x/PT_MXQuant.md
index cae57208315..65ad53317bd 100644
--- a/docs/source/3x/PT_MXQuant.md
+++ b/docs/source/3x/PT_MXQuant.md
@@ -89,18 +89,22 @@ To get a model quantized with Microscaling Data Types, users can use the AutoRou
 from neural_compressor.torch.quantization import AutoRoundConfig, prepare, convert
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
-fp32_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m", device_map="auto",)
+fp32_model = AutoModelForCausalLM.from_pretrained(
+    "facebook/opt-125m",
+    device_map="auto",
+)
 tokenizer = AutoTokenizer.from_pretrained("facebook/opt-125m", trust_remote_code=True)
 output_dir = "./saved_inc"
 
 # quantization configuration
-quant_config = AutoRoundConfig(tokenizer=tokenizer,
+quant_config = AutoRoundConfig(
+    tokenizer=tokenizer,
     nsamples=32,
     seqlen=32,
     iters=20,
-    scheme="MXFP4", # MXFP4, MXFP8
+    scheme="MXFP4",  # MXFP4, MXFP8
     export_format="auto_round",
-    output_dir=output_dir, # default is "temp_auto_round"
+    output_dir=output_dir,  # default is "temp_auto_round"
 )
 
 # quantize the model and save to output_dir
@@ -114,7 +118,6 @@ model = AutoModelForCausalLM.from_pretrained(output_dir, torch_dtype="auto", dev
 text = "There is a girl who likes adventure,"
 inputs = tokenizer(text, return_tensors="pt").to(model.device)
 print(tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]))
-
 ```
 
 ## Examples

From b317f05df3ee5f1b90f038ac3e5e1f89713fc181 Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Sat, 11 Oct 2025 12:43:55 +0800
Subject: [PATCH 3/5] rename example link

Signed-off-by: Kaihui-intel
---
 docs/source/3x/PT_MXQuant.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/3x/PT_MXQuant.md b/docs/source/3x/PT_MXQuant.md
index cae57208315..6f5c406347c 100644
--- a/docs/source/3x/PT_MXQuant.md
+++ b/docs/source/3x/PT_MXQuant.md
@@ -119,7 +119,7 @@ print(tokenizer.decode(model.generate(**inputs, max_new_tokens=10)[0]))
 
 ## Examples
 
-- PyTorch [huggingface models](/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4)
+- PyTorch [LLM/VLM models](/examples/pytorch/multimodal-modeling/quantization/auto_round/llama4)
 
 ## Reference
 

From b1ea183cc7f72f5f77db6935cf35ebc8348568cf Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Sat, 11 Oct 2025 14:17:11 +0800
Subject: [PATCH 4/5] remove wrong sentence

Signed-off-by: Kaihui-intel
---
 docs/source/3x/PT_MXQuant.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/3x/PT_MXQuant.md b/docs/source/3x/PT_MXQuant.md
index 75b94b88f4c..c8ca399c38f 100644
--- a/docs/source/3x/PT_MXQuant.md
+++ b/docs/source/3x/PT_MXQuant.md
@@ -70,7 +70,7 @@ Neural Compressor seamlessly applies the MX data type to post-training quantizat
 
 
 
-The memory and computational limits of LLMs are more severe than other general neural networks, so our exploration focuses on LLMs first. The following table shows the basic MX quantization recipes in Neural Compressor and enumerates distinctions among various data types. The MX data type replaces general float scale with powers of two to be more hardware-friendly. It adapts a granularity falling between per-channel and per-tensor to balance accuracy and memory consumption.
+The memory and computational limits of LLMs are more severe than those of other general neural networks, so our exploration focuses on LLMs first. The following table shows the basic MX quantization recipes in Neural Compressor and enumerates distinctions among various data types. The MX data type replaces the general float scale with powers of two to be more hardware-friendly.
 
 | | MX Format | INT8 | FP8 |
 |------------|--------------|------------|------------|

From 2d9c95de8344247d5a259c9095f92bc7d9cc9a9a Mon Sep 17 00:00:00 2001
From: Kaihui-intel
Date: Sat, 11 Oct 2025 15:55:54 +0800
Subject: [PATCH 5/5] update formula

Signed-off-by: Kaihui-intel
---
 docs/source/3x/PT_MXQuant.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/3x/PT_MXQuant.md b/docs/source/3x/PT_MXQuant.md
index c8ca399c38f..3472aabea44 100644
--- a/docs/source/3x/PT_MXQuant.md
+++ b/docs/source/3x/PT_MXQuant.md
@@ -78,7 +78,7 @@ The memory and computational limits of LLMs are more severe than those of other
 | Zero point | 0 (None) | $2^{bits - 1}$ or $-min * scale$ | 0 (None) |
 | Granularity | per-block (default blocksize is 32) | per-channel or per-tensor | per-channel or per-tensor |
 
-The exponent (exp) is equal to torch.floor(torch.log2(amax)), MAX is the representation range of the data type, amax is the max absolute value of per-block tensor, and rmin is the minimum value of the per-block tensor.
+The exponent (exp) is equal to clamp(floor(log2(amax)) - maxExp, -127, 127), where maxExp is the exponent of the largest representable value of the element data type, MAX is the maximum representable value of the data type, amax is the maximum absolute value of the per-block tensor, and rmin is the minimum value of the per-block tensor.
 
 ## Get Started with Microscaling Quantization API
 
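The per-block computation that the updated sentence in PATCH 5/5 describes can be sketched as follows. This is a minimal illustration rather than Neural Compressor code: the helper name `mx_quant_dequant` and the element parameters (`max_exp=2`, `elem_max=6.0`, roughly an FP4 E2M1 element) are assumptions, and rounding onto the element grid is omitted, so only the shared power-of-two scale and the clamping step are shown.

```python
import torch


def mx_quant_dequant(x: torch.Tensor, block_size: int = 32, max_exp: int = 2, elem_max: float = 6.0) -> torch.Tensor:
    """Illustrative per-block MX quantize/dequantize (assumed FP4-like element parameters)."""
    blocks = x.reshape(-1, block_size)
    # amax: max absolute value of each block; clamp avoids log2(0)
    amax = blocks.abs().amax(dim=-1, keepdim=True).clamp_min(torch.finfo(x.dtype).tiny)
    # exp = clamp(floor(log2(amax)) - maxExp, -127, 127), as in the updated formula
    exp = torch.clamp(torch.floor(torch.log2(amax)) - max_exp, -127, 127)
    scale = torch.exp2(exp)  # per-block power-of-two scale, zero point is 0
    # element rounding to the FP4/FP8 grid is omitted; only range clamping is shown
    q = torch.clamp(blocks / scale, -elem_max, elem_max)
    return (q * scale).reshape(x.shape)


weight = torch.randn(4, 64)
print((weight - mx_quant_dequant(weight)).abs().max())
```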