Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,4 @@ save*
*.pid
*.ipynb*
.venv/
*.sh
*.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# RTN FP8 (e4m3) weight + activation quantization of IndustrialCoder-32B,
# evaluated on wikitext2 and exported in vLLM format.
base:
    seed: &seed 42
model:
    type: IndustrialCoder
    path: model/IndustrialCoder-32B
    tokenizer_mode: slow
    torch_dtype: auto
    # Reduce peak memory in catcher stage for large models.
    use_cpu_to_save_cuda_mem_for_catcher: False
eval:
    eval_pos: [fake_quant]
    name: wikitext2
    download: True
    seq_len: 2048
    bs: 1
    inference_per_block: False
quant:
    # RTN needs no calibration data, hence no calib section.
    method: RTN
    weight:
        quant_type: float-quant
        bit: e4m3
        symmetric: True
        granularity: per_channel
        use_qtorch: True
    act:
        quant_type: float-quant
        bit: e4m3
        symmetric: True
        granularity: per_token
        use_qtorch: True
save:
    save_vllm: True
    save_path: ./save_for_vllm/industrialcoder_rtn_fp8_wikitext/
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# AWQ W4A16 (per-group, group_size=128) quantization of IndustrialCoder-32B,
# calibrated on pileval, evaluated on wikitext2, exported in vLLM format.
base:
    seed: &seed 42
model:
    type: IndustrialCoder
    path: model/IndustrialCoder-32B
    tokenizer_mode: slow
    torch_dtype: auto
    # Reduce peak memory in catcher stage for large models.
    use_cpu_to_save_cuda_mem_for_catcher: False
calib:
    name: pileval
    download: True
    # path: calib data path
    n_samples: 128
    # bs = -1: batch all calibration samples together.
    bs: -1
    seq_len: 512
    preproc: txt_general_preproc
    seed: *seed
eval:
    eval_pos: [fake_quant]
    name: wikitext2
    download: True
    seq_len: 2048
    # Large-model eval settings: larger batch with per-block inference.
    bs: 20
    inference_per_block: True
quant:
    method: Awq
    weight:
        bit: 4
        symmetric: True
        granularity: per_group
        group_size: 128
        need_pack: True
    special:
        trans: True
        trans_version: v2
        weight_clip: True
    quant_out: True
save:
    save_vllm: True
    # NOTE(review): directory name contains "rtn" but the method is Awq —
    # confirm the intended output directory name.
    save_path: ./save_for_vllm/industrialcoder_rtn_int_awq_wikitext/
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# GPTQ W4A16 (per-group, group_size=128) quantization of IndustrialCoder-32B,
# calibrated and evaluated on wikitext2, exported in vLLM format.
base:
    seed: &seed 42
model:
    type: IndustrialCoder
    path: model/IndustrialCoder-32B
    tokenizer_mode: slow
    torch_dtype: auto
    # Reduce peak memory in catcher stage for large models.
    use_cpu_to_save_cuda_mem_for_catcher: False
calib:
    name: wikitext2
    download: True
    n_samples: 128
    # path: calib data path
    bs: 1
    seq_len: 2048
    preproc: wikitext2_gptq
    seed: *seed
eval:
    eval_pos: [fake_quant]
    name: wikitext2
    download: True
    seq_len: 2048
    # Large-model eval settings: larger batch with per-block inference.
    bs: 20
    inference_per_block: True
quant:
    method: GPTQ
    weight:
        bit: 4
        symmetric: True
        granularity: per_group
        group_size: 128
        need_pack: True
    special:
        actorder: True
        static_groups: True
        percdamp: 0.01
        blocksize: 128
        true_sequential: True
    quant_out: True
save:
    save_vllm: True
    # NOTE(review): directory name contains "rtn" but the method is GPTQ —
    # confirm the intended output directory name.
    save_path: ./save_for_vllm/industrialcoder_rtn_int_gptq_wikitext/
33 changes: 33 additions & 0 deletions configs/quantization/backend/vllm/fp8/thinking_model_fp8.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# RTN FP8 (e4m3) weight + activation quantization of IndustrialCoder-Thinking,
# evaluated on wikitext2 and exported in vLLM format.
base:
    seed: &seed 42
model:
    type: IndustrialCoder
    # NOTE(review): hard-coded user-specific absolute path — replace with a
    # repo-relative path (or placeholder) before merging.
    path: /mnt/lm_data_afs/wangzining/charles/industrial_thinking/IndustrialCoder-Thinking
    tokenizer_mode: slow
    torch_dtype: auto
    # Reduce peak memory in catcher stage for large models.
    use_cpu_to_save_cuda_mem_for_catcher: False
eval:
    eval_pos: [fake_quant]
    name: wikitext2
    download: True
    seq_len: 2048
    bs: 1
    inference_per_block: False
quant:
    # RTN needs no calibration data, hence no calib section.
    method: RTN
    weight:
        quant_type: float-quant
        bit: e4m3
        symmetric: True
        granularity: per_channel
        use_qtorch: True
    act:
        quant_type: float-quant
        bit: e4m3
        symmetric: True
        granularity: per_token
        use_qtorch: True
save:
    save_vllm: True
    save_path: ./save_for_vllm/thinking_rtn_fp8_wikitext/
41 changes: 41 additions & 0 deletions configs/quantization/backend/vllm/thinkingmodel/gptq_w4a16.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# GPTQ W4A16 (per-group, group_size=128) quantization of IndustrialCoder-Thinking,
# calibrated and evaluated on wikitext2, exported in vLLM format.
base:
    # NOTE(review): sibling configs use seed 42 — confirm 0 is intentional here.
    seed: &seed 0
model:
    type: IndustrialCoder
    # NOTE(review): hard-coded user-specific absolute path — replace with a
    # repo-relative path (or placeholder) before merging.
    path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/thinkingmodel/IndustrialCoder-Thinking
    torch_dtype: auto
calib:
    name: wikitext2
    download: True
    n_samples: 128
    # path: calib data path
    bs: 1
    seq_len: 2048
    preproc: wikitext2_gptq
    seed: *seed
eval:
    eval_pos: [fake_quant]
    name: wikitext2
    download: True
    # path: eval data path
    bs: 1
    seq_len: 2048
    inference_per_block: False
quant:
    method: GPTQ
    weight:
        bit: 4
        symmetric: True
        granularity: per_group
        group_size: 128
        need_pack: True
    special:
        actorder: True
        static_groups: True
        percdamp: 0.01
        blocksize: 128
        true_sequential: True
    quant_out: True
save:
    save_vllm: True
    # NOTE(review): hard-coded user-specific absolute output path — confirm.
    save_path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/save_for_vllm/thinking_gptq_w4/
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# AWQ W4A16 (per-group, group_size=128) quantization of IndustrialCoder-Thinking,
# calibrated and evaluated on wikitext2, exported in vLLM format.
base:
    seed: &seed 42
model:
    type: IndustrialCoder
    # NOTE(review): hard-coded user-specific absolute path — replace with a
    # repo-relative path (or placeholder) before merging.
    path: /mnt/lm_data_afs/wangzining/charles/industrial_thinking/IndustrialCoder-Thinking
    tokenizer_mode: slow
    torch_dtype: auto
    # Reduce peak memory in catcher stage for large models.
    use_cpu_to_save_cuda_mem_for_catcher: False
calib:
    name: wikitext2
    download: True
    # path: /mnt/lm_data_afs/wangzining/charles/datasets/wikitext2
    n_samples: 128
    bs: 1
    seq_len: 512
    preproc: txt_general_preproc
    seed: *seed
eval:
    eval_pos: [fake_quant]
    name: wikitext2
    download: True
    # path: /mnt/lm_data_afs/wangzining/charles/datasets/wikitext2
    seq_len: 2048
    bs: 1
    inference_per_block: False
quant:
    method: Awq
    weight:
        bit: 4
        symmetric: True
        granularity: per_group
        group_size: 128
        need_pack: True
    special:
        trans: True
        trans_version: v2
        weight_clip: True
    quant_out: True
save:
    save_vllm: True
    # NOTE(review): hard-coded user-specific absolute output path — confirm.
    save_path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/save_for_vllm/thinking_awq_w4/
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# AWQ W4A16 (per-group, group_size=128) quantization template for
# IndustrialCoder-Thinking, exported in vLLM format. Placeholder paths below
# must be filled in by the user.
base:
    seed: &seed 42
model:
    type: IndustrialCoder
    # NOTE(review): hard-coded user-specific absolute path — replace with a
    # placeholder consistent with the other placeholder paths in this file.
    path: /mnt/lm_data_afs/wangzining/charles/industrial_thinking/IndustrialCoder-Thinking
    tokenizer_mode: slow
    torch_dtype: auto
calib:
    name: pileval
    download: False
    # TODO: with download: False this literal placeholder is used as-is —
    # replace with a real local dataset path before running.
    path: calib data path
    n_samples: 128
    # bs = -1: batch all calibration samples together.
    bs: -1
    seq_len: 512
    preproc: txt_general_preproc
    seed: *seed
eval:
    eval_pos: [fake_quant]
    name: wikitext2
    download: False
    # TODO: replace with a real local dataset path before running.
    path: eval data path
    seq_len: 2048
    # For 7B / 13B model eval, bs can be set to "1", and inference_per_block can be set to "False".
    # For 70B model eval, bs can be set to "20", and inference_per_block can be set to "True".
    bs: 1
    inference_per_block: False
quant:
    method: Awq
    weight:
        bit: 4
        symmetric: True
        granularity: per_group
        group_size: 128
        need_pack: True
    special:
        trans: True
        trans_version: v2
        weight_clip: True
    quant_out: True
save:
    save_vllm: True
    save_path: /path/to/save_for_vllm_awq_w4/
54 changes: 54 additions & 0 deletions configs/quantization/video_gen/wan2_2_t2v/awq_w_a.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# AWQ 4-bit (hif4) weight + activation quantization of Wan2.2 T2V (Diffusers),
# with video-generation calibration/eval, exported in lightx2v format.
base:
    seed: &seed 42
model:
    type: Wan2T2V
    # NOTE(review): hard-coded user-specific absolute path — replace with a
    # repo-relative path (or placeholder) before merging.
    path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/model/Wan2.2-T2V-A14B-Diffusers
    torch_dtype: auto
calib:
    name: t2v
    download: False
    path: ./assets/wan_t2v/calib/
    sample_steps: 20
    bs: 1
    target_height: 480
    target_width: 832
    num_frames: 81
    guidance_scale: 5.0
    seed: *seed
eval:
    eval_pos: [transformed, fake_quant]
    type: video_gen
    name: t2v
    download: False
    path: ./assets/wan_t2v/calib/
    bs: 1
    target_height: 480
    target_width: 832
    num_frames: 81
    guidance_scale: 5.0
    output_video_path: ./output_videos_awq/
    inference_per_block: True
quant:
    video_gen:
        method: Awq
        weight:
            # quant_type: int-quant
            quant_type: hif4
            bit: 4
            symmetric: True
            granularity: per_channel
            # group_size = -1: no grouping; scales are per-channel.
            group_size: -1
        act:
            # quant_type: int-quant
            quant_type: hif4
            bit: 4
            symmetric: True
            granularity: per_token
        special:
            trans: True
            trans_version: v2
            weight_clip: True
            clip_sym: True
save:
    save_lightx2v: True
    save_path: ./save_for_lightx2v/wan2_2_t2v/awq_w_a/original/
8 changes: 4 additions & 4 deletions configs/quantization/video_gen/wan_i2v/awq_w_a.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ base:
seed: &seed 42
model:
type: WanI2V
path: /path/to/model
path: /mnt/lm_data_afs/wangzining/charles/lab/llmc/models/Wan2.2-T2V-A14B/
torch_dtype: auto
calib:
name: i2v
Expand Down Expand Up @@ -31,12 +31,12 @@ quant:
video_gen:
method: Awq
weight:
bit: 8
bit: 4
symmetric: True
granularity: per_channel
group_size: -1
act:
bit: 8
bit: 4
symmetric: True
granularity: per_token
special:
Expand All @@ -46,4 +46,4 @@ quant:
clip_sym: True
save:
save_lightx2v: True
save_path: /path/to/x2v/
save_path: ../lightx2v/wan_i2v_awq_w_a/x2v/
Loading