@@ -34,9 +34,9 @@ def _is_auto_round_available():
 _is_auto_round_available()
 
 from auto_round import AutoRound, AutoRoundMLLM  # pylint: disable=E0401
+from auto_round.compressors.mllm.eval import lmms_eval, mllm_eval
+from auto_round.compressors.mllm.template import Template, get_template
 from auto_round.export.export_to_itrex.export import pack_model  # pylint: disable=E0401
-from auto_round.mllm import lmms_eval, mllm_eval
-from auto_round.mllm.template import Template, get_template
 from auto_round.schemes import QuantizationScheme
 
 from neural_compressor.torch.algorithms import Quantizer
@@ -50,11 +50,24 @@ class AutoRoundQuantizer(Quantizer):
 
     def __init__(
         self,
-        quant_config: dict = {},
+        bits: int = None,
+        group_size: int = None,
+        sym: bool = None,
+        data_type: str = None,
+        act_bits: int = None,
+        act_group_size: int = None,
+        act_sym: bool = None,
+        act_data_type: str = None,
+        act_dynamic: bool = None,
+        super_bits: int = None,
+        super_group_size: int = None,
+        quant_config: dict = {},  # for INC
+        layer_config: dict[str, Union[str, dict, QuantizationScheme]] = None,
         enable_full_range: bool = False,  ##for symmetric, TODO support later
         batch_size: int = 8,
         amp: bool = True,
         device_map: str = None,
+        quant_lm_head: bool = False,
         lr_scheduler=None,
         dataset: Union[str, list, tuple, torch.utils.data.DataLoader] = "NeelNanda/pile-10k",
         enable_quanted_input: bool = True,
@@ -71,21 +84,14 @@ def __init__(
         gradient_accumulate_steps: int = 1,
         not_use_best_mse: bool = False,
         dynamic_max_gap: int = -1,
-        data_type: str = "int",
         scale_dtype: str = "fp16",
         to_quant_block_names: list = None,
-        act_bits: int = 32,
-        act_group_size: int = None,
-        act_sym: bool = None,
-        act_dynamic: bool = True,
-        act_data_type: Optional[str] = None,
         low_cpu_mem_usage: bool = False,
         export_format: str = "itrex",
         # v0.4
         enable_norm_bias_tuning: bool = False,
         enable_torch_compile: bool = None,
         # mllm
-        is_mllm: bool = False,
         quant_nontext_module: bool = False,
         extra_data_dir: str = None,
         image_processor=None,
@@ -119,13 +125,15 @@ def __init__(
             bits (int): Number of bits for quantization (default is 4).
             group_size (int): Size of the quantization group (default is 128).
             sym (bool): Whether to use symmetric quantization. (default is None).
+            layer_config (dict, optional): Layer-wise quantization config. Defaults to None.
             bits (int): Number of bits for quantization (default is 4).
             group_size (int): Size of the quantization group (default is 128).
             sym (bool): Whether symmetric quantization is to be used (default is False).
             enable_full_range (bool): Whether to enable full range quantization (default is False).
             batch_size (int): Batch size for training (default is 8).
             amp (bool): Whether to use automatic mixed precision (default is True).
             device_map: The device to be used for tuning (default is None).
+            quant_lm_head (bool): Whether to quantize the lm_head layer in transformers (default is False).
             lr_scheduler: The learning rate scheduler to be used.
             dataset (str): The default dataset name (default is "NeelNanda/pile-10k").
             enable_quanted_input (bool): Whether to use the output of the previous quantized block as
@@ -155,7 +163,6 @@ def __init__(
             enable_norm_bias_tuning (bool): Whether to enable fast norm/layer_bias tuning.
             enable_torch_compile (bool): Whether to enable torch compile to optimize quant_block/layer, torch>=2.6 True.
             quant_nontext_module (bool): Whether to quantize nontext module.
-            is_mllm (bool): Indicates whether the model to be quantized is a multi-modal model (MLLM).
             extra_data_dir (str): The path for extra data such as images, audio or videos.
             processor (transformers.AutoProcessor): Any multi-modal model will require an object to encode or
                 decode the data that groups several modalities (among text, vision and audio).
@@ -170,12 +177,26 @@ def __init__(
             The quantized model.
         """
         super().__init__(quant_config)
-        self.tokenizer = "Placeholder"  # for AutoRound initialization
+        self.layer_config = layer_config
+        self.output_dir = kwargs.pop("output_dir", "temp_auto_round")
+        self.tokenizer = kwargs.pop("tokenizer", "Placeholder")  # for AutoRound initialization
         self.enable_full_range = enable_full_range
+        self.bits = bits
+        self.group_size = group_size
+        self.sym = sym
+        self.data_type = data_type
+        self.act_bits = act_bits
+        self.act_group_size = act_group_size
+        self.act_sym = act_sym
+        self.act_data_type = act_data_type
+        self.act_dynamic = act_dynamic
+        self.super_bits = super_bits
+        self.super_group_size = super_group_size
         self.batch_size = batch_size
         self.amp = amp
         self.device = get_accelerator(kwargs.pop("device", "auto")).name()
         self.lr_scheduler = lr_scheduler
+        self.dataset = dataset
         self.enable_quanted_input = enable_quanted_input
         self.enable_minmax_tuning = enable_minmax_tuning
         self.lr = lr
@@ -190,19 +211,12 @@ def __init__(
         self.gradient_accumulate_steps = gradient_accumulate_steps
         self.not_use_best_mse = not_use_best_mse
         self.dynamic_max_gap = dynamic_max_gap
-        self.data_type = data_type
         self.scale_dtype = scale_dtype
         self.to_quant_block_names = to_quant_block_names
-        self.act_bits = act_bits
-        self.act_group_size = act_group_size
-        self.act_sym = act_sym
-        self.act_dynamic = act_dynamic
-        self.act_data_type = act_data_type
         self.low_cpu_mem_usage = low_cpu_mem_usage
         self.export_format = export_format
         self.enable_norm_bias_tuning = enable_norm_bias_tuning
         self.enable_torch_compile = enable_torch_compile
-        self.is_mllm = is_mllm
         self.quant_nontext_module = quant_nontext_module
         self.extra_data_dir = extra_data_dir
         self.processor = processor
@@ -211,9 +225,10 @@ def __init__(
         self.truncation = truncation
         self.scheme = scheme
         self.device_map = device_map
+        self.quant_lm_head = quant_lm_head
         self.enable_w4afp8 = self._is_w4afp8()
 
-    def _is_w4afp8(self):
+    def _is_w4afp8(self) -> bool:
         return any([v.get("data_type", None) == "fp8_to_int_sym" for v in self.quant_config.values()])
 
     def prepare(self, model: torch.nn.Module, *args, **kwargs):
@@ -237,96 +252,75 @@ def convert(self, model: torch.nn.Module, *args, **kwargs):
         Returns:
             The quantized model.
         """
-        dataloader = CapturedDataloader(model.args_list, model.kwargs_list)
-        model = model.orig_model
-        if self.is_mllm:
-            rounder = AutoRoundMLLM(
-                model,
-                tokenizer=self.tokenizer,
-                scheme=self.scheme,
-                processor=self.processor,
-                image_processor=self.image_processor,
-                layer_config=self.quant_config,
-                batch_size=self.batch_size,
-                amp=self.amp,
-                device_map=self.device_map,
-                lr_scheduler=self.lr_scheduler,
-                dataset=dataloader,
-                extra_data_dir=self.extra_data_dir,
-                template=self.template,
-                quant_nontext_module=self.quant_nontext_module,
-                enable_quanted_input=self.enable_quanted_input,
-                enable_minmax_tuning=self.enable_minmax_tuning,
-                lr=self.lr,
-                minmax_lr=self.minmax_lr,
-                low_gpu_mem_usage=self.low_gpu_mem_usage,
-                low_cpu_mem_usage=self.low_gpu_mem_usage,
-                iters=self.iters,
-                seqlen=self.seqlen,
-                nsamples=self.nsamples,
-                sampler=self.sampler,
-                seed=self.seed,
-                nblocks=self.nblocks,
-                gradient_accumulate_steps=self.gradient_accumulate_steps,
-                not_use_best_mse=self.not_use_best_mse,
-                dynamic_max_gap=self.dynamic_max_gap,
-                data_type=self.data_type,
-                scale_dtype=self.scale_dtype,
-                act_bits=self.act_bits,
-                act_group_size=self.act_group_size,
-                act_sym=self.act_sym,
-                act_dynamic=self.act_dynamic,
-                to_quant_block_names=self.to_quant_block_names,
-                enable_norm_bias_tuning=self.enable_norm_bias_tuning,
-                truncation=self.truncation,
-                enable_torch_compile=self.enable_torch_compile,
-            )
+        tokenizer = getattr(model.orig_model, "tokenizer", None)
+        if tokenizer is not None:
+            delattr(model.orig_model, "tokenizer")
         else:
-            rounder = AutoRound(
-                model=model,
-                tokenizer=self.tokenizer,
-                scheme=self.scheme,
-                dataset=dataloader,
-                layer_config=self.quant_config or {},
-                enable_full_range=self.enable_full_range,
-                batch_size=self.batch_size,
-                amp=self.amp,
-                device_map=self.device_map,
-                lr_scheduler=self.lr_scheduler,
-                enable_quanted_input=self.enable_quanted_input,
-                enable_minmax_tuning=self.enable_minmax_tuning,
-                lr=self.lr,
-                minmax_lr=self.minmax_lr,
-                low_gpu_mem_usage=self.low_gpu_mem_usage,
-                iters=self.iters,
-                seqlen=self.seqlen,
-                nsamples=self.nsamples,
-                sampler=self.sampler,
-                seed=self.seed,
-                nblocks=self.nblocks,
-                gradient_accumulate_steps=self.gradient_accumulate_steps,
-                not_use_best_mse=self.not_use_best_mse,
-                dynamic_max_gap=self.dynamic_max_gap,
-                data_type=self.data_type,
-                scale_dtype=self.scale_dtype,
-                to_quant_block_names=self.to_quant_block_names,
-                act_bits=self.act_bits,
-                act_group_size=self.act_group_size,
-                act_sym=self.act_sym,
-                act_dynamic=self.act_dynamic,
-                low_cpu_mem_usage=self.low_cpu_mem_usage,
-                enable_norm_bias_tuning=self.enable_norm_bias_tuning,
-                enable_torch_compile=self.enable_torch_compile,
-            )
-        model, weight_config = rounder.quantize()
-        model.autoround_config = weight_config
+            tokenizer = "Placeholder"
+        self.dataset = CapturedDataloader(model.args_list, model.kwargs_list)
+        model = model.orig_model
+        rounder = AutoRound(
+            model,
+            layer_config=self.layer_config,
+            bits=self.bits,
+            data_type=self.data_type,
+            group_size=self.group_size,
+            sym=self.sym,
+            act_bits=self.act_bits,
+            act_group_size=self.act_group_size,
+            act_sym=self.act_sym,
+            act_data_type=self.act_data_type,
+            act_dynamic=self.act_dynamic,
+            super_bits=self.super_bits,
+            super_group_size=self.super_group_size,
+            tokenizer=tokenizer,
+            scheme=self.scheme,
+            processor=self.processor,
+            image_processor=self.image_processor,
+            enable_full_range=self.enable_full_range,
+            batch_size=self.batch_size,
+            amp=self.amp,
+            device_map=self.device_map,
+            lr_scheduler=self.lr_scheduler,
+            dataset=self.dataset,
+            extra_data_dir=self.extra_data_dir,
+            template=self.template,
+            quant_nontext_module=self.quant_nontext_module,
+            enable_quanted_input=self.enable_quanted_input,
+            enable_minmax_tuning=self.enable_minmax_tuning,
+            lr=self.lr,
+            minmax_lr=self.minmax_lr,
+            low_gpu_mem_usage=self.low_gpu_mem_usage,
+            low_cpu_mem_usage=self.low_gpu_mem_usage,
+            iters=self.iters,
+            seqlen=self.seqlen,
+            nsamples=self.nsamples,
+            sampler=self.sampler,
+            seed=self.seed,
+            nblocks=self.nblocks,
+            gradient_accumulate_steps=self.gradient_accumulate_steps,
+            not_use_best_mse=self.not_use_best_mse,
+            dynamic_max_gap=self.dynamic_max_gap,
+            scale_dtype=self.scale_dtype,
+            to_quant_block_names=self.to_quant_block_names,
+            enable_norm_bias_tuning=self.enable_norm_bias_tuning,
+            truncation=self.truncation,
+            enable_torch_compile=self.enable_torch_compile,
+            quant_lm_head=self.quant_lm_head,
+        )
+
         if self.enable_w4afp8:
-            return rounder.save_quantized(output_dir="temp_auto_round", inplace=True)
+            model, weight_config = rounder.quantize()
+            model.autoround_config = weight_config
+            return rounder.save_quantized(output_dir=self.output_dir, inplace=True)
         elif "itrex" in self.export_format:
+            model, weight_config = rounder.quantize()
+            model.autoround_config = weight_config
             model = pack_model(model, weight_config, device=self.device, inplace=True)
         else:  # pragma: no cover
-            model = rounder.save_quantized(output_dir="temp_auto_round", format=self.export_format, inplace=True)
-
+            rounder.quantize_and_save(output_dir=self.output_dir, format=self.export_format, inplace=True)
+            model = rounder.model
+            model.autoround_config = rounder.layer_config
         return model
 
 
@@ -389,8 +383,8 @@ def get_mllm_dataloader(
         DataLoader: The DataLoader for the calibrated datasets.
     """
     from auto_round.calib_dataset import CALIB_DATASETS
-    from auto_round.mllm.autoround_mllm import _only_text_test
-    from auto_round.mllm.mllm_dataset import get_mllm_dataloader  # pylint: disable=E0401
+    from auto_round.compressors.mllm.compressor import _only_text_test
+    from auto_round.compressors.mllm.dataset import get_mllm_dataloader  # pylint: disable=E0401
 
     template = template if template is not None else model.config.model_type
     template = get_template(
@@ -424,7 +418,7 @@ def get_mllm_dataloader(
         nsamples = (nsamples // batch_size + 1) * batch_size
         logger.warning(f"'nsamples' is not divisible by 'batch_size', will adjusted to {nsamples}")
 
-    dataloader, batch_size, gradient_accumulate_steps = get_mllm_dataloader(
+    dataloader, batch_size, seqlen, gradient_accumulate_steps = get_mllm_dataloader(
         template=template,
         processor=processor,
         model=model,
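
For context, a minimal usage sketch of the quantizer after this refactor (not part of the commit). The import path is an assumption based on this repository's layout, "facebook/opt-125m" is only a placeholder model, and the calibration step mirrors the prepare/convert flow and the getattr(model.orig_model, "tokenizer", None) lookup shown above:

# Sketch only: assumes AutoRoundQuantizer is importable from the path below.
from transformers import AutoModelForCausalLM, AutoTokenizer

from neural_compressor.torch.algorithms.weight_only.autoround import AutoRoundQuantizer

model_name = "facebook/opt-125m"  # placeholder model for illustration
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# convert() now fetches the tokenizer from model.orig_model, so attach it
# to the model before prepare() wraps it.
model.tokenizer = tokenizer

# Quantization knobs are now explicit keyword arguments instead of a packed
# quant_config dict; quant_lm_head and output_dir are new in this change.
quantizer = AutoRoundQuantizer(
    bits=4,
    group_size=128,
    sym=True,
    data_type="int",
    iters=200,
    quant_lm_head=False,
    export_format="itrex",
    output_dir="temp_auto_round",  # consumed via kwargs.pop() in __init__
)

# prepare() wraps the model so calibration batches are captured; convert()
# replays them through AutoRound via the CapturedDataloader shown above.
model = quantizer.prepare(model)
calib = tokenizer("AutoRound calibration sample.", return_tensors="pt")
model(calib["input_ids"])
q_model = quantizer.convert(model)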