vllm-project · ralphbean · Oct 7, 2025 · Oct 17, 2025 · Oct 23, 2025 · Oct 24, 2025
diff --git a/src/llmcompressor/args/model_arguments.py b/src/llmcompressor/args/model_arguments.py
@@ -50,10 +50,6 @@ class ModelArguments:
             "help": "Pretrained processor name or path if not the same as model_name"
         },
     )
-    cache_dir: str | None = field(
-        default=None,
-        metadata={"help": "Where to store the pretrained data from huggingface.co"},
-    )
 
     use_auth_token: bool = field(
         default=False,

diff --git a/src/llmcompressor/entrypoints/oneshot.py b/src/llmcompressor/entrypoints/oneshot.py
@@ -225,7 +225,6 @@ def oneshot(
     config_name: Optional[str] = None,
     tokenizer: Optional[Union[str, PreTrainedTokenizerBase]] = None,
     processor: Optional[Union[str, ProcessorMixin]] = None,
-    cache_dir: Optional[str] = None,
     use_auth_token: bool = False,
     precision: str = "auto",
     tie_word_embeddings: bool = False,
@@ -273,8 +272,6 @@ def oneshot(
         model_name.
     :param processor: Pretrained processor name or path if not the same as
         model_name.
-    :param cache_dir: Where to store the pretrained data from
-        huggingface.co.
     :param use_auth_token: Whether to use Hugging Face auth token for private
         models.
     :param precision: Precision to cast model weights to, default to auto.

diff --git a/src/llmcompressor/entrypoints/utils.py b/src/llmcompressor/entrypoints/utils.py
@@ -175,7 +175,7 @@ def initialize_model_from_path(
     model_path = model_args.model
     config = AutoConfig.from_pretrained(
         model_args.config_name if model_args.config_name else model_path,
-        cache_dir=model_args.cache_dir,
+        cache_dir=None,
         revision=model_args.model_revision,
         use_auth_token=True if model_args.use_auth_token else None,
         trust_remote_code=model_args.trust_remote_code_model,
@@ -211,7 +211,7 @@ def initialize_model_from_path(
             )
             teacher_kwargs = {
                 "config": teacher_config,
-                "cache_dir": model_args.cache_dir,
+                "cache_dir": None,
                 "use_auth_token": True if model_args.use_auth_token else None,
                 "torch_dtype": parse_dtype(model_args.precision),
                 "device_map": teacher_device_map,
@@ -233,7 +233,7 @@ def initialize_model_from_path(
 
     model_kwargs = {
         "config": config,
-        "cache_dir": model_args.cache_dir,
+        "cache_dir": None,
         "revision": model_args.model_revision,
         "use_auth_token": True if model_args.use_auth_token else None,
         "torch_dtype": parse_dtype(model_args.precision),
@@ -266,7 +266,7 @@ def initialize_processor_from_path(
     try:
         processor = AutoProcessor.from_pretrained(
             processor_src,
-            cache_dir=model_args.cache_dir,
+            cache_dir=None,
             use_fast=True,
             revision=model_args.model_revision,
             use_auth_token=True if model_args.use_auth_token else None,
@@ -285,7 +285,7 @@ def initialize_processor_from_path(
         logger.debug("Could not load fast processor, loading slow processor instead")
         processor = AutoProcessor.from_pretrained(
             processor_src,
-            cache_dir=model_args.cache_dir,
+            cache_dir=None,
             use_fast=False,
             revision=model_args.model_revision,
             use_auth_token=True if model_args.use_auth_token else None,

diff --git a/src/llmcompressor/pytorch/model_load/helpers.py b/src/llmcompressor/pytorch/model_load/helpers.py
@@ -149,16 +149,17 @@ def copy_python_files_from_model_cache(model, save_path: str):
         import shutil
 
         from huggingface_hub import hf_hub_download
-        from transformers import TRANSFORMERS_CACHE
         from transformers.utils import http_user_agent
 
         cache_path = config._name_or_path
         if not os.path.exists(cache_path):
             user_agent = http_user_agent()
+            # Pass cache_dir=None so huggingface_hub resolves the cache location
+            # from HF_HOME, HF_HUB_CACHE, and related environment variables
             config_file_path = hf_hub_download(
                 repo_id=cache_path,
                 filename="config.json",
-                cache_dir=TRANSFORMERS_CACHE,
+                cache_dir=None,
                 force_download=False,
                 user_agent=user_agent,
             )

diff --git a/src/llmcompressor/transformers/finetune/data/base.py b/src/llmcompressor/transformers/finetune/data/base.py
@@ -195,7 +195,7 @@ def load_dataset(self):
         logger.debug(f"Loading dataset {self.dataset_args.dataset}")
         return get_raw_dataset(
             self.dataset_args,
-            None,
+            cache_dir=None,
             split=self.split,
             streaming=self.dataset_args.streaming,
             **self.dataset_args.raw_kwargs,

diff --git a/tests/llmcompressor/transformers/finetune/data/conftest.py b/tests/llmcompressor/transformers/finetune/data/conftest.py
@@ -18,7 +18,6 @@ def tiny_llama_model_args(tiny_llama_path):
 def tiny_llama_tokenizer(tiny_llama_model_args):
     tokenizer = AutoTokenizer.from_pretrained(
         tiny_llama_model_args.model,
-        cache_dir=tiny_llama_model_args.cache_dir,
         use_fast=True,
         revision=tiny_llama_model_args.model_revision,
         use_auth_token=True if tiny_llama_model_args.use_auth_token else None,