ModelTC · llmc-reviewer · Jan 13, 2025 · Jan 13, 2025 · Jan 13, 2025 · Jan 13, 2025
diff --git a/config.yml b/config.yml
@@ -0,0 +1,46 @@
+base:
+    seed: &seed 42
+model:
+    type: Qwen2
+    path: /home/gushiqiao/nvme/gushiqiao/bussinesss/code_72b/SenseChat-Code-Tmp
+    tokenizer_mode: fast
+    torch_dtype: auto
+calib:
+    name: pileval
+    download: False
+    path: /home/gushiqiao/nvme/gushiqiao/llm_datasets/calib/pileval
+    n_samples: 256
+    bs: -1
+    seq_len: 512
+    preproc: txt_general_preproc
+    seed: *seed
+# eval:
+#     - eval_pos: [ fake_quant]
+#       name: wikitext2
+#       download: False
+#       path: /home/gushiqiao/nvme/gushiqiao/llm_datasets/eval/wikitext2
+#       seq_len: 2048
+#       # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
+#       # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
+#       bs: 10
+#       inference_per_block: True
+quant:
+    method: Awq
+    weight:
+        bit: 8
+        symmetric: True
+        granularity: per_channel
+        group_size: -1
+    act:
+        bit: 8
+        symmetric: True
+        granularity: per_token
+    special:
+        trans: True
+        trans_version: v2
+        weight_clip: False
+        awq_bs: 128
+    quant_out: True
+save:
+    save_trans: True
+    save_path: ./awq_test_new_pileval_down_ov/
diff --git a/configs/quantization/backend/sglang/fp8/awq_fp8.yml b/configs/quantization/backend/sglang/fp8/awq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True

diff --git a/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml b/configs/quantization/backend/sglang/fp8/awq_fp8_static.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_tensor
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True

diff --git a/configs/quantization/backend/sglang/fp8/gptq_fp8.yml b/configs/quantization/backend/sglang/fp8/gptq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: GPTQ
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True

diff --git a/configs/quantization/backend/sglang/fp8/rtn_fp8.yml b/configs/quantization/backend/sglang/fp8/rtn_fp8.yml
@@ -17,13 +17,14 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_token

diff --git a/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml b/configs/quantization/backend/sglang/fp8/smoothquant_fp8.yml
@@ -22,14 +22,15 @@ eval:
     seq_len: 2048
 quant:
     method: SmoothQuant
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True

diff --git a/configs/quantization/backend/vllm/fp8/awq_fp8.yml b/configs/quantization/backend/vllm/fp8/awq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True

diff --git a/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml b/configs/quantization/backend/vllm/fp8/awq_fp8_static.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_tensor
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True

diff --git a/configs/quantization/backend/vllm/fp8/gptq_fp8.yml b/configs/quantization/backend/vllm/fp8/gptq_fp8.yml
@@ -26,14 +26,15 @@ eval:
     inference_per_block: False
 quant:
     method: GPTQ
-    quant_type: float_quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True

diff --git a/configs/quantization/backend/vllm/fp8/rtn_fp8.yml b/configs/quantization/backend/vllm/fp8/rtn_fp8.yml
@@ -17,13 +17,14 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_token

diff --git a/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml b/configs/quantization/backend/vllm/fp8/smoothquant_fp8.yml
@@ -22,14 +22,15 @@ eval:
     seq_len: 2048
 quant:
     method: SmoothQuant
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True
         granularity: per_channel
         use_qtorch: True
     act:
+        quant_type: float-quant
         # Support ["e4m3", "e5m2"]
         bit: e4m3
         symmetric: True

diff --git a/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/awq_we2m1a16_g128.yml
@@ -25,8 +25,8 @@ eval:
     inference_per_block: False
 quant:
     method: Awq
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: False
         granularity: per_group

diff --git a/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/gptq_we2m1a16_g128.yml
@@ -26,8 +26,8 @@ eval:
     inference_per_block: False
 quant:
     method: GPTQ
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_group

diff --git a/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml b/configs/quantization/methods/FP_Quant/rtn_we2m1a16_g128.yml
@@ -16,8 +16,8 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_group

diff --git a/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml b/configs/quantization/methods/FP_Quant/rtn_we2m1ae2m1.yml
@@ -16,12 +16,13 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_channel
     act:
+        quant_type: float-quant
         bit: e2m1
         symmetric: True
         granularity: per_token

diff --git a/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml b/configs/quantization/methods/FP_Quant/rtn_we4m3ae4m3.yml
@@ -16,12 +16,13 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_channel
     act:
+        quant_type: float-quant
         bit: e4m3
         symmetric: True
         granularity: per_token

diff --git a/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml b/configs/quantization/methods/FP_Quant/rtn_we5m2ae5m2.yml
@@ -16,12 +16,13 @@ eval:
     inference_per_block: False
 quant:
     method: RTN
-    quant_type: float-quant
     weight:
+        quant_type: float-quant
         bit: e5m2
         symmetric: True
         granularity: per_channel
     act:
+        quant_type: float-quant
         bit: e5m2
         symmetric: True
         granularity: per_token

diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_kivi_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_kivi_quant_kv.yml
@@ -5,14 +5,14 @@ model:
     path: model path
     torch_dtype: auto
 eval:
-    eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos
+    eval_pos: [transformed, fake_quant, fake_quant_wo_kv]  # decode_ppl eval does not support the pretrain eval_pos
     name: wikitext2
     type: decode_ppl
     download: False
     path: eval_data_path
     bs: 1
     inference_per_block: False
-    num_samples: 10
+    num_samples: 50
     # num_eval_tokens: 3
 quant:
     method: RTN

diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_naive_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_naive_quant_kv.yml
@@ -5,14 +5,14 @@ model:
     path: model path
     torch_dtype: auto
 eval:
-    eval_pos: [transformed, fake_quant, fake_quant_wo_kv] #long_ppl eval not support pretrain eval pos
+    eval_pos: [transformed, fake_quant, fake_quant_wo_kv]  # decode_ppl eval does not support the pretrain eval_pos
     name: wikitext2
     type: decode_ppl
     download: False
     path: eval_data_path
     bs: 1
     inference_per_block: False
-    num_samples: 10
+    num_samples: 50
     # num_eval_tokens: 3
 quant:
     method: RTN

diff --git a/configs/quantization/methods/KVQuant/rtn_w_a_sink_quant_kv.yml b/configs/quantization/methods/KVQuant/rtn_w_a_sink_quant_kv.yml
diff --git a/configs/quantization/methods/RTN/rtn_w_a_pertensor_static.yml b/configs/quantization/methods/RTN/rtn_w_a_pertensor_static.yml
@@ -35,6 +35,7 @@ quant:
         symmetric: True
         granularity: per_tensor
         static: True
+        calib_algo: static_hist
 save:
     save_fake: False
     save_path: /path/to/save/