Commit 5742998

Rectify Asym Compression/Decompression Pathways (#225)
* fix asym
* fix
* update tests
* fix
* update
* docstring, comments, typing
1 parent 914d4dd commit 5742998

File tree

4 files changed, +59 -10 lines changed

  src/compressed_tensors/compressors/model_compressors/model_compressor.py (+12 -3)
  src/compressed_tensors/compressors/quantized_compressors/base.py (+31 -2)
  tests/test_compressors/quantized_compressors/test_int_quant.py (+7 -3)
  tests/test_compressors/quantized_compressors/test_pack_quant.py (+9 -2)

src/compressed_tensors/compressors/model_compressors/model_compressor.py (+12 -3)

@@ -19,7 +19,7 @@
 import re
 from contextlib import contextmanager
 from copy import deepcopy
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, TypeVar, Union
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Set, Tuple, TypeVar, Union

 import compressed_tensors
 import torch
@@ -522,10 +522,13 @@ def _replace_weights(self, dense_weight_generator, model: Module):
             update_parameter_data(module, data, param_name)


-def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
+def map_modules_to_quant_args(
+    model: Module,
+) -> Dict[str, Union[QuantizationArgs, Tuple[QuantizationArgs, QuantizationArgs]]]:
     """
     Given a pytorch model, map out the submodule name (usually linear layers)
-    to the QuantizationArgs
+    to the weight QuantizationArgs. If running input activation quantization, will also
+    map to the input QuantizationArgs in a tuple.

     :param model: pytorch model
     """
@@ -535,6 +538,12 @@ def map_modules_to_quant_args(model: Module) -> Dict[str, QuantizationArgs]:
             if submodule.quantization_scheme.weights is not None:
                 name = fix_fsdp_module_name(name)
                 quantized_modules_to_args[name] = submodule.quantization_scheme.weights
+            if submodule.quantization_scheme.input_activations is not None:
+                weight_args = quantized_modules_to_args.get(name)
+                quantized_modules_to_args[name] = (
+                    weight_args,
+                    submodule.quantization_scheme.input_activations,
+                )

     return quantized_modules_to_args
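
With this change the values in the returned mapping are no longer always a bare QuantizationArgs: modules that also quantize input activations map to a (weight_args, input_args) tuple. Below is a minimal sketch of how a caller can normalize the two forms; the compressed_tensors.quantization import path and the split_quant_args helper are illustrative, not part of the commit.

from typing import Optional, Tuple, Union

from compressed_tensors.quantization import QuantizationArgs

# either a bare weight args or a (weight_args, input_args) pair
Entry = Union[QuantizationArgs, Tuple[QuantizationArgs, QuantizationArgs]]


def split_quant_args(entry: Entry) -> Tuple[QuantizationArgs, Optional[QuantizationArgs]]:
    """Return (weight_args, input_args) for either form stored in the mapping."""
    if isinstance(entry, tuple):
        weight_args, input_args = entry
        return weight_args, input_args
    return entry, None


# weight-only module -> plain QuantizationArgs
weight_only = QuantizationArgs(num_bits=4, symmetric=False)
# weight + input activation quantization -> (weight_args, input_args) tuple
weight_and_input = (weight_only, QuantizationArgs(num_bits=8))

print(split_quant_args(weight_only))       # (weight_args, None)
print(split_quant_args(weight_and_input))  # (weight_args, input_args)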

src/compressed_tensors/compressors/quantized_compressors/base.py (+31 -2)

@@ -82,19 +82,44 @@ def compress(
         """
         compressed_dict = {}
         weight_suffix = ".weight"
+        input_zp_suffix = ".input_zero_point"
+        weight_zp_suffix = ".weight_zero_point"
         _LOGGER.debug(
             f"Compressing model with {len(model_state)} parameterized layers..."
         )

         for name, value in tqdm(model_state.items(), desc="Quantized Compression"):
+            # check if the parameter we're compressing is the weight zp
+            # or the input zp
+            is_weight_zp = name.endswith(weight_zp_suffix)
+            is_input_zp = name.endswith(input_zp_suffix)
+
+            # if we're saving the weight zp, fetch weight quant args
+            if is_weight_zp:
+                quant_args_zp = names_to_scheme.get(name[: -(len(weight_zp_suffix))])
+                if isinstance(quant_args_zp, tuple):
+                    # If tuple, first value is weight args, second is input args
+                    quant_args_zp = quant_args_zp[0]
+
+            # if we're saving the input zp, fetch input quant args
+            if is_input_zp:
+                input_args_zp = names_to_scheme.get(name[: -(len(input_zp_suffix))])
+                if isinstance(input_args_zp, tuple):
+                    # If tuple, first value is weight args, second is input args
+                    input_args_zp = input_args_zp[-1]
+
             if name.endswith(weight_suffix):
                 prefix = name[: -(len(weight_suffix))]
                 scale = model_state.get(merge_names(prefix, "weight_scale"), None)
                 zp = model_state.get(merge_names(prefix, "weight_zero_point"), None)
                 g_idx = model_state.get(merge_names(prefix, "weight_g_idx"), None)
                 if scale is not None:
                     # weight is quantized, compress it
-                    quant_args = names_to_scheme[prefix]
+                    if isinstance(names_to_scheme[prefix], tuple):
+                        quant_args = names_to_scheme[prefix][0]
+                    else:
+                        quant_args = names_to_scheme[prefix]
+
                     compressed_data = self.compress_weight(
                         weight=value,
                         scale=scale,
@@ -107,7 +132,11 @@ def compress(
                         compressed_dict[merge_names(prefix, key)] = value
                 else:
                     compressed_dict[name] = value.to("cpu")
-            elif name.endswith("zero_point") and torch.all(value == 0):
+            # only save if asym
+            elif is_weight_zp and quant_args_zp.symmetric:
+                continue
+            # only save if asym
+            elif is_input_zp and input_args_zp.symmetric:
                 continue
             elif name.endswith("g_idx") and torch.any(value <= -1):
                 continue
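
The net effect in compress is that zero-point tensors are now kept or dropped based on the symmetry of the matching quantization args (weight args for *.weight_zero_point, input args for *.input_zero_point) instead of the old all-zeros check. A standalone sketch of that decision, using a stand-in DummyArgs object rather than the real library types, looks like this:

from dataclasses import dataclass


@dataclass
class DummyArgs:
    symmetric: bool = True


def should_skip_zero_point(name: str, names_to_scheme: dict) -> bool:
    """True when a zero-point tensor can be dropped, i.e. the matching args are symmetric."""
    for suffix, index in ((".weight_zero_point", 0), (".input_zero_point", -1)):
        if name.endswith(suffix):
            args = names_to_scheme.get(name[: -len(suffix)])
            if isinstance(args, tuple):
                # (weight_args, input_args) when input activations are also quantized
                args = args[index]
            return args is not None and args.symmetric
    return False


schemes = {"layer": (DummyArgs(symmetric=True), DummyArgs(symmetric=False))}
print(should_skip_zero_point("layer.weight_zero_point", schemes))  # True: symmetric weights, zp dropped
print(should_skip_zero_point("layer.input_zero_point", schemes))   # False: asymmetric inputs, zp saved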

tests/test_compressors/quantized_compressors/test_int_quant.py (+7 -3)

@@ -27,11 +27,13 @@
 from safetensors.torch import save_file


-def get_dummy_quant_config(strategy, group_size=None):
+def get_dummy_quant_config(strategy, group_size=None, symmetric=True):
     config_groups = {
         "group_1": QuantizationScheme(
             targets=["Linear"],
-            weights=QuantizationArgs(strategy=strategy, group_size=group_size),
+            weights=QuantizationArgs(
+                strategy=strategy, group_size=group_size, symmetric=symmetric
+            ),
         ),
     }
     ignore = ["lm_head"]
@@ -69,7 +71,9 @@ def test_quant_format(strategy, symmetric, group_size, sc, zp):
         "dummy.weight_scale": torch.tensor(sc, dtype=torch.float32),
         "dummy.weight_zero_point": torch.tensor(zp, dtype=torch.int32),
     }
-    quant_config = get_dummy_quant_config(strategy=strategy, group_size=group_size)
+    quant_config = get_dummy_quant_config(
+        strategy=strategy, group_size=group_size, symmetric=symmetric
+    )

     compressor = IntQuantizationCompressor(config=quant_config)
     quantized_modules_to_args = {"dummy": quant_config.config_groups["group_1"].weights}
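
The parametrized symmetric flag now flows through the dummy config helper, so asymmetric cases build weight args whose zero-points must survive compression. A quick illustration of what the helper now constructs (import path assumed from the library, values arbitrary):

from compressed_tensors.quantization import QuantizationArgs, QuantizationScheme

scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(strategy="tensor", group_size=None, symmetric=False),
)
assert scheme.weights.symmetric is False  # asymmetric weights -> zero-point gets saved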

tests/test_compressors/quantized_compressors/test_pack_quant.py (+9 -2)

@@ -37,7 +37,9 @@
 from torch.nn.modules import Linear, Sequential


-def get_dummy_quant_config(num_bits=4, strategy=None, group_size=None, actorder=None):
+def get_dummy_quant_config(
+    num_bits=4, strategy=None, group_size=None, actorder=None, symmetric=True
+):
     config_groups = {
         "group_1": QuantizationScheme(
             targets=["Linear"],
@@ -46,6 +48,7 @@ def get_dummy_quant_config(num_bits=4, strategy=None, group_size=None, actorder=
                 strategy=strategy,
                 group_size=group_size,
                 actorder=actorder,
+                symmetric=symmetric,
             ),
         ),
     }
@@ -151,21 +154,25 @@ def test_reload_match(tmp_path, num_bits):
         "dummy2.weight_zero_point": torch.tensor(15, dtype=torch.int8),
     }

+    # pack-compressor only needs the number of bits from the quant-args to decompress
+    # all other information is extracted from the compressed data directly
     names_to_scheme = {
        "dummy": QuantizationArgs(num_bits=num_bits),
        "dummy2": QuantizationArgs(num_bits=num_bits),
     }
-    quant_config = get_dummy_quant_config(num_bits)
+    quant_config = get_dummy_quant_config(num_bits, symmetric=False)

     compressor = PackedQuantizationCompressor(config=quant_config)
     quantized_modules_to_args = {
         "dummy": quant_config.config_groups["group_1"].weights,
         "dummy2": quant_config.config_groups["group_1"].weights,
     }
+
     compressed_state_dict = compressor.compress(
         dense_state_dict, names_to_scheme=quantized_modules_to_args
     )
     save_file(compressed_state_dict, tmp_path / "model.safetensors")
+
     reconstructed_dense_gen = compressor.decompress(
         tmp_path, names_to_scheme=names_to_scheme
     )
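
Switching the reload test to symmetric=False matters because an asymmetric range produces a nonzero zero-point; if compression silently dropped it, decompression could not reproduce the fake-quantized weights. A plain-torch illustration of why the zero-point is needed for the round trip (numbers are arbitrary and not taken from the test):

import torch

w = torch.tensor([-0.4, 0.0, 0.6, 1.1])            # values spanning an asymmetric range
qmin, qmax = 0, 15                                  # unsigned 4-bit range
scale = (w.max() - w.min()) / (qmax - qmin)
zero_point = qmin - torch.round(w.min() / scale)    # nonzero (4 here) for asymmetric data
q = torch.clamp(torch.round(w / scale) + zero_point, qmin, qmax)
w_hat = (q - zero_point) * scale                    # zero-point is required to recover w
print(q)      # quantized integer levels
print(w_hat)  # matches the original values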
