@@ -1,22 +1,17 @@
-import inspect
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Optional
 
 import torch
 from compressed_tensors.quantization import (
     DynamicType,
-    KVCacheScaleType,
     QuantizationArgs,
-    QuantizationScheme,
     QuantizationStatus,
     QuantizationStrategy,
 )
 from compressed_tensors.quantization.lifecycle.forward import forward_quantize
-from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
 from compressed_tensors.utils import align_module_device, update_offload_parameter
 from loguru import logger
 from torch.nn import Module
 
-from llmcompressor.modifiers.quantization.cache import QuantizedKVParameterCache
 from llmcompressor.observers import Observer
 from llmcompressor.utils.helpers import getattr_chain
 
@@ -25,13 +20,13 @@
     "update_weight_zp_scale",
     "calibrate_input_hook",
     "calibrate_output_hook",
-    "calibrate_kv_cache_input_hook",
-    "calibrate_kv_cache_output_hook",
-    "initialize_quantized_kv_cache",
     "freeze_module_quantization",
     "apply_calibration_status",
    "reset_quantization_status",
     "update_weight_global_scale",
+    "calibrate_query_hook",
+    "calibrate_key_hook",
+    "calibrate_value_hook",
 ]
 
 
@@ -151,8 +146,9 @@ def calibrate_activations(module: Module, value: torch.Tensor, base_name: str):
     if value.numel() == 0:
         return
 
-    quantization_scheme = getattr(module, "quantization_scheme", None)
-    quantization_args = getattr(quantization_scheme, f"{base_name}_activations", None)
+    field_name = "input" if base_name != "output" else "output"  # input,q,k,v,output
+    args_attr = f"quantization_scheme.{field_name}_activations"
+    quantization_args = getattr_chain(module, args_attr, None)
 
     calculate_qparams = True
     calculate_gparam = False
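
The new lines in this hunk route the `q`, `k`, `v`, and plain `input` base names to the same `input_activations` quantization args, while only `output` resolves to `output_activations`. Below is a minimal, self-contained sketch of that resolution; the `getattr_chain` helper here is a simplified stand-in for `llmcompressor.utils.helpers.getattr_chain`, written only for illustration:

```python
from types import SimpleNamespace
from typing import Any, Optional


def getattr_chain(obj: Any, dotted_name: str, default: Any = None) -> Any:
    # simplified stand-in: walk a dotted attribute path, falling back to default
    for attr in dotted_name.split("."):
        obj = getattr(obj, attr, None)
        if obj is None:
            return default
    return obj


def resolve_activation_args(module: Any, base_name: str) -> Optional[Any]:
    # "q", "k", "v", and "input" share input_activations; only "output"
    # resolves to output_activations
    field_name = "input" if base_name != "output" else "output"
    return getattr_chain(module, f"quantization_scheme.{field_name}_activations", None)


# toy module carrying a quantization scheme with input/output activation args
module = SimpleNamespace(
    quantization_scheme=SimpleNamespace(
        input_activations="int8-args", output_activations=None
    )
)
assert resolve_activation_args(module, "q") == "int8-args"
assert resolve_activation_args(module, "output") is None
```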
@@ -202,60 +198,16 @@ def calibrate_output_hook(module: Module, _args: Any, output: torch.Tensor):
     return output
 
 
-def calibrate_kv_cache_input_hook(
-    module: Module, args: Any, kwargs: Dict[str, Any]
-) -> Tuple[Tuple[Any, ...], Dict[str, Any]]:
-    """
-    Hook to update inputs to attention layers when running
-    kv_cache quantization. Will update the passed in
-    kv_cache to singleton QuantizedKVParameterCache.
-    """
-    kv_cache = getattr(module, "kv_cache")
-    if not hasattr(module, "_past_kv_name"):
-        # Determine which past KV parameter name to use once and cache it
-        # TODO: Find a better place to cache this
-        module._past_kv_name = (
-            "past_key_value"  # transformers#39956
-            if "past_key_value" in inspect.signature(module.forward).parameters
-            else "past_key_values"
-        )
-
-    kwargs[module._past_kv_name] = kv_cache
-    kwargs["use_cache"] = False
-    return args, kwargs
+def calibrate_query_hook(module: Module, query_states: torch.Tensor):
+    calibrate_activations(module, query_states, base_name="q")
 
 
-def calibrate_kv_cache_output_hook(module: Module, _args: Any, _output: torch.Tensor):
-    """
-    Hook to update k_scale and v_scale parameters when running kv_cache quantization.
-    """
-    kv_cache = getattr(module, "kv_cache")
-    k_scale = kv_cache.k_scales[module.layer_idx]
-    v_scale = kv_cache.v_scales[module.layer_idx]
-    update_offload_parameter(module, KVCacheScaleType.KEY.value, k_scale)
-    update_offload_parameter(module, KVCacheScaleType.VALUE.value, v_scale)
+def calibrate_key_hook(module: Module, key_states: torch.Tensor):
+    calibrate_activations(module, key_states, base_name="k")
 
 
-def initialize_quantized_kv_cache(module: Module):
-    """
-    Initialize a quantized kv_cache on a module (analogous to initializing an observer)
-    When a config specifying kv_cache quantization is applied to a model, the kv_cache
-    args are redefined as the output_activations targeting attention modules.
-
-    This function should be called on attention modules with output_activations
-    """
-    scheme: Optional[QuantizationScheme] = getattr(module, "quantization_scheme", None)
-    existing_kv_cache = getattr(module, "kv_cache", None)
-
-    if (
-        scheme is None
-        or not is_kv_cache_quant_scheme(scheme)
-        or isinstance(existing_kv_cache, QuantizedKVParameterCache)
-    ):
-        return
-
-    quantized_kv_cache = QuantizedKVParameterCache(scheme.output_activations)
-    setattr(module, "kv_cache", quantized_kv_cache)
+def calibrate_value_hook(module: Module, value_states: torch.Tensor):
+    calibrate_activations(module, value_states, base_name="v")
 
 
 def apply_calibration_status(module: Module):
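
The three hooks added above replace the KV-cache input/output hooks with per-tensor calibration of query, key, and value states; how they get attached is handled elsewhere in the modifier. The sketch below is only a hypothetical wiring, assuming an attention block that exposes `q_proj`/`k_proj`/`v_proj` submodules and already carries a `quantization_scheme`, and assuming this file imports as `llmcompressor.modifiers.quantization.calibration`; the real integration point (e.g. post-RoPE states) may differ:

```python
# Hypothetical wiring sketch only: submodule names and the import path below
# are assumptions, not the library's actual registration mechanism.
from torch import nn
from torch.utils.hooks import RemovableHandle

from llmcompressor.modifiers.quantization.calibration import (
    calibrate_key_hook,
    calibrate_query_hook,
    calibrate_value_hook,
)


def attach_qkv_calibration(attn_module: nn.Module) -> list[RemovableHandle]:
    """Observe the outputs of an attention block's q/k/v projections during a
    calibration pass, assuming ``attn_module`` carries a quantization_scheme."""
    handles = []
    for proj_name, calib_fn in (
        ("q_proj", calibrate_query_hook),
        ("k_proj", calibrate_key_hook),
        ("v_proj", calibrate_value_hook),
    ):
        proj = getattr(attn_module, proj_name)

        def _hook(_mod, _inputs, output, fn=calib_fn):
            # observe only; return None so the projection output is unchanged
            fn(attn_module, output)

        handles.append(proj.register_forward_hook(_hook))
    return handles
```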
@@ -284,16 +236,11 @@ def freeze_module_quantization(module: Module):
         return
 
     # remove observers
-    for name in ("input", "weight", "output"):
+    for name in ("input", "weight", "output", "q", "k", "v"):
         obs_name = f"{name}_observer"
         if hasattr(module, obs_name):
             delattr(module, obs_name)
 
-    # remove quantized kv_cache
-    kv_cache = getattr(module, "kv_cache", None)
-    if isinstance(kv_cache, QuantizedKVParameterCache):
-        delattr(module, "kv_cache")
-
     module.quantization_status = QuantizationStatus.FROZEN
 
 