@@ -11,6 +11,10 @@ docker run -d --gpus all -v ... --shm-size=100g --name llama4 -it nvcr.io/nvidia
docker exec -it llama4 bash
git clone https://github.com/intel/neural-compressor.git
cd neural-compressor/examples/3.x_api/pytorch/multimodal-modeling/quantization/auto_round/llama4
# Before the neural-compressor v3.6 release, use `INC_PT_ONLY=1 pip install git+https://github.com/intel/[email protected]` for the latest updates
pip install neural-compressor-pt==3.6
# Before the auto-round v0.8.0 release, use `pip install git+https://github.com/intel/[email protected]` for the latest updates
pip install auto-round==0.8.0
bash setup.sh
```
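After setup, a quick sanity check can confirm that the pinned versions are the ones actually installed (this assumes both packages expose a `__version__` attribute, which is common but not guaranteed):

```bash
python -c "import neural_compressor, auto_round; print(neural_compressor.__version__, auto_round.__version__)"
```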

@@ -0,0 +1,97 @@
# Copyright (c) 2025 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import torch

# Prefer deterministic algorithms for reproducibility; warn rather than error when no deterministic implementation exists.
torch.use_deterministic_algorithms(True, warn_only=True)
from transformers import AutoTokenizer, Llama4ForConditionalGeneration, AutoProcessor
from neural_compressor.torch.quantization import (
AutoRoundConfig,
convert,
prepare,
)


class BasicArgumentParser(argparse.ArgumentParser):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.add_argument("--model", "--model_name", "--model_name_or_path",
                          help="model name or path")

        self.add_argument("--scheme", default="MXFP4", type=str,
                          help="quantization scheme")

        self.add_argument("--device", "--devices", default="auto", type=str,
                          help="the device to use for tuning. The default is auto, "
                               "allowing automatic detection. "
                               "Currently, device settings support CPU, GPU, and HPU.")

        self.add_argument("--export_format", default="llm_compressor", type=str,
                          help="the format in which to save the model")

        self.add_argument("--output_dir", default="./tmp_autoround", type=str,
                          help="the directory in which to save the quantized model")

        self.add_argument("--fp_layers", default="", type=str,
                          help="comma-separated substrings of layer names to keep "
                               "in the original data type")


def setup_parser():
    parser = BasicArgumentParser()

    parser.add_argument("--iters", "--iter", default=0, type=int,
                        help="number of tuning iterations")

    args = parser.parse_args()
    return args


def tune(args):
    model_name = args.model.rstrip("/")
    print(f"start to quantize {model_name}")

    layer_config = {}
    model = Llama4ForConditionalGeneration.from_pretrained(
        model_name, device_map=None, torch_dtype="auto", trust_remote_code=True
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True)
    # Drop empty entries so a blank --fp_layers does not match (and exempt) every layer.
    fp_layers = [name for name in args.fp_layers.replace(" ", "").split(",") if name]
    if fp_layers:
        for n, m in model.named_modules():
            if not isinstance(m, torch.nn.Linear):
                continue
            for name in fp_layers:
                if name in n:
                    # Keep matched linear layers in 16-bit weights and activations.
                    layer_config[n] = {"bits": 16, "act_bits": 16}
                    break

    qconfig = AutoRoundConfig(
        tokenizer=tokenizer,
        iters=args.iters,
        scheme=args.scheme,
        layer_config=layer_config,
        export_format=args.export_format,
        output_dir=args.output_dir,
        processor=processor,
    )
    model = prepare(model, qconfig)
    model = convert(model, qconfig)


if __name__ == "__main__":
    args = setup_parser()
    tune(args)
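
For reference, a hypothetical invocation of this script, mirroring the flags that `run_tuning` passes in the diff below (the model ID and output path are illustrative, not taken from this PR):

```bash
python3 main.py \
    --model meta-llama/Llama-4-Scout-17B-16E-Instruct \
    --iters 0 \
    --scheme MXFP4 \
    --fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert \
    --output_dir ./tmp_autoround
```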
@@ -1,4 +1,3 @@
-auto-round @ git+https://github.com/intel/[email protected]
 lm-eval==0.4.9.1
 setuptools_scm
 torchao==0.12.0
@@ -47,12 +47,11 @@ function run_tuning {
extra_cmd="--fp_layers lm-head,self_attn,router,vision_model,multi_modal_projector,shared_expert --scheme MXFP4"
fi

-    python3 -m auto_round \
-        --model ${input_model} \
-        --iters ${iters} \
-        --format "llm_compressor" \
-        --output_dir ${tuned_checkpoint} \
-        ${extra_cmd}
+    python3 main.py \
+        --model ${input_model} \
+        --iters ${iters} \
+        --output_dir ${tuned_checkpoint} \
+        ${extra_cmd}
}

main "$@"