[not4land] repro dynamo error

Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
pytorch · Oct 18, 2024 · f1d0d67 · f1d0d67
1 parent b56e2ee
commit f1d0d67
Show file tree

Hide file tree

Showing 5 changed files with 81 additions and 16 deletions.
diff --git a/torchbenchmark/util/backends/torchdynamo.py b/torchbenchmark/util/backends/torchdynamo.py
@@ -181,12 +181,11 @@ def apply_torchdynamo_args(
                     ),
                 )
 
+        print("{args.quantization=}")
         if args.quantization:
             import torchao
             from torchao.quantization import (
-                change_linear_weights_to_int4_woqtensors,
-                change_linear_weights_to_int8_dqtensors,
-                change_linear_weights_to_int8_woqtensors,
+                quantize_, int8_weight_only, int4_weight_only, int8_dynamic_activation_int8_weight
             )
 
             torch._dynamo.config.automatic_dynamic_shapes = False
@@ -196,12 +195,52 @@ def apply_torchdynamo_args(
             module, example_inputs = model.get_module()
             if args.quantization == "int8dynamic":
                 torch._inductor.config.force_fuse_int_mm_with_mul = True
-                change_linear_weights_to_int8_dqtensors(module)
+                quantize_(module, int8_dynamic_activation_int8_weight(), set_inductor_config=False)
             elif args.quantization == "int8weightonly":
                 torch._inductor.config.use_mixed_mm = True
-                change_linear_weights_to_int8_woqtensors(module)
+                quantize_(module, int8_weight_only(), set_inductor_config=False)
             elif args.quantization == "int4weightonly":
-                change_linear_weights_to_int4_woqtensors(module)
+                quantize_(module, int4_weight_only(), set_inductor_config=False)
+            if args.quantization == "autoquant":
+                print("module:", type(module))
+
+                torchao.autoquant(module, example_input=example_inputs, manual=True, error_on_unseen=False, set_inductor_config=False)
+                # torchao.autoquant(module, error_on_unseen=False, set_inductor_config=False)
+                if isinstance(example_inputs, dict):
+                    module(**example_inputs)
+                else:
+                    module(*example_inputs)
+
+                module.finalize_autoquant()
+
+                # for n, m in model.named_modules():
+                #     if isinstance(m, torch.nn.Linear):
+                #         print(f"name {n}, weight type:, {type(m.weight.data)}")
+
+                from torchao.quantization.autoquant import AUTOQUANT_CACHE
+                assert len(AUTOQUANT_CACHE)>0, f"Err: found no autoquantizable layers in model {type(module)}, stopping autoquantization"
+
+                # print("autoquant profile")
+                # from torchao.utils import benchmark_model, profiler_runner
+                # model = torch.compile(module, mode="max-autotune")
+                # inputs = example_inputs
+                # benchmark_model(model, 20, inputs)
+                # print("elapsed_time: ", benchmark_model(model, 100, inputs), " milliseconds")
+                # profiler_runner("quant.json.gz", benchmark_model, model, 5, inputs)
+
+            else:
+                unwrap_tensor_subclass(module)
+        # else:
+        #     module, example_inputs = model.get_module()
+        #     # noquant profile
+        #     print("noquant profile")
+        #     from torchao.utils import benchmark_model, profiler_runner
+        #     model = torch.compile(module, mode="max-autotune")
+        #     inputs = example_inputs
+        #     benchmark_model(model, 20, inputs)
+        #     print("elapsed_time: ", benchmark_model(model, 100, inputs), " milliseconds")
+        #     profiler_runner("noquant.json.gz", benchmark_model, model, 5, inputs)
+
 
         if args.freeze_prepack_weights:
             torch._inductor.config.freezing = True

diff --git a/torchbenchmark/util/experiment/metrics.py b/torchbenchmark/util/experiment/metrics.py
@@ -13,7 +13,7 @@
 from torchbenchmark.util.experiment.instantiator import TorchBenchModelConfig
 from torchbenchmark.util.model import BenchmarkModel
 
-WARMUP_ROUNDS = 10
+WARMUP_ROUNDS = 20
 BENCHMARK_ITERS = 15
 MEMPROF_ITER = 2
 NANOSECONDS_PER_MILLISECONDS = 1_000_000.0
@@ -53,6 +53,12 @@ def get_latencies(
             func()
             t1 = time.time_ns()
         result_summary.append((t1 - t0) / NANOSECONDS_PER_MILLISECONDS)
+
+    # from torchao.utils import benchmark_model, profiler_runner
+    # print("device:", device)
+    # print("elapsed:", benchmark_model(func, 100, (), device_type="cuda"))
+    # profiler_runner("quant.json.gz", benchmark_model, func, 5, (), device_type="cuda")
+
     return result_summary
 
 

diff --git a/userbenchmark/dynamo/dynamobench/torchao_backend.py b/userbenchmark/dynamo/dynamobench/torchao_backend.py
@@ -24,6 +24,15 @@ def torchao_optimize_ctx(quantization: str):
     def inner(model_iter_fn: Callable):
         def _torchao_apply(module: torch.nn.Module, example_inputs: Any):
             if getattr(module, "_quantized", None) is None:
+                if quantization == "noquant":
+                    print("noquant run")
+                    from torchao.utils import benchmark_model, profiler_runner
+                    model = torch.compile(module, mode="max-autotune")
+                    inputs = example_inputs
+                    benchmark_model(model, 20, inputs)
+                    print("elapsed_time: ", benchmark_model(model, 100, inputs), " milliseconds")
+                #     profiler_runner("noquant.json.gz", benchmark_model, model, 5, inputs)
+
                 if quantization == "int8dynamic":
                     quantize_(
                         module,
@@ -35,21 +44,33 @@ def _torchao_apply(module: torch.nn.Module, example_inputs: Any):
                 elif quantization == "int4weightonly":
                     quantize_(module, int4_weight_only(), set_inductor_config=False)
                 if quantization == "autoquant":
-                    autoquant(module, error_on_unseen=False, set_inductor_config=False)
+                    autoquant(module, example_input=example_inputs, manual=True, error_on_unseen=False, set_inductor_config=False)
                     if isinstance(example_inputs, dict):
                         module(**example_inputs)
                     else:
                         module(*example_inputs)
+                    module.finalize_autoquant()
+
                     from torchao.quantization.autoquant import AUTOQUANT_CACHE
 
                     if len(AUTOQUANT_CACHE) == 0:
                         raise Exception(  # noqa: TRY002`
                             "NotAutoquantizable"
                             f"Found no autoquantizable layers in model {type(module)}, stopping autoquantized run"
                         )
+
+                    print("autoquant run")
+                    from torchao.utils import benchmark_model, profiler_runner
+                    model = torch.compile(module, mode="max-autotune")
+                    inputs = example_inputs
+                    benchmark_model(model, 20, inputs)
+                    print("elapsed_time: ", benchmark_model(model, 100, inputs), " milliseconds")
+                    # profiler_runner("quant.json.gz", benchmark_model, model, 5, inputs)
                 else:
                     unwrap_tensor_subclass(module)
                 setattr(module, "_quantized", True)  # noqa: B010
+
+
             model_iter_fn(module, example_inputs)
 
         return _torchao_apply

diff --git a/userbenchmark/group_bench/configs/torch_ao.yaml b/userbenchmark/group_bench/configs/torch_ao.yaml
@@ -1,7 +1,4 @@
-model: "*"
-extended_models:
-  - huggingface
-  - timm
+model: "resnet18"
 test: eval
 device: cuda
 extra_args: --precision bf16 --torchdynamo inductor --inductor-compile-mode max-autotune
@@ -10,7 +7,7 @@ metrics:
 test_group:
   test_batch_size_default:
     subgroup:
-      - extra_args:
-      - extra_args: --quantization int8dynamic
-      - extra_args: --quantization int8weightonly
-      - extra_args: --quantization int4weightonly
+      - extra_args: --quantization autoquant
+      - extra_args: --quantization noquant      
+
+
diff --git a/userbenchmark/torchao/run.py b/userbenchmark/torchao/run.py
@@ -92,6 +92,8 @@ def run(args: List[str]):
     else:
         benchmark_args = [pt2_args]
 
+    print("benchmark args:", benchmark_args)
+
     output_files = [_run_pt2_args(args) for args in benchmark_args]
     # Post-processing
     if args.dashboard: