@@ -79,18 +79,18 @@ We use full train data for fine-tuning. The training cost is 50 mins on 8*P100-1
79
79
80
80
```shell
81
81
cd code
82
- python run.py \
83
- --output_dir=./saved_models \
82
+ CUDA_VISIBLE_DEVICES=4,6 python run.py \
83
+ --output_dir=./adv_saved_models \
84
84
--model_type=roberta \
85
85
--tokenizer_name=microsoft/codebert-base \
86
86
--model_name_or_path=microsoft/codebert-base \
87
87
--do_train \
88
- --train_data_file=../preprocess/dataset/train.jsonl \
88
+ --train_data_file=../preprocess/dataset/adv_train.jsonl \
89
89
--eval_data_file=../preprocess/dataset/valid.jsonl \
90
90
--test_data_file=../preprocess/dataset/test.jsonl \
91
91
--epoch 5 \
92
92
--block_size 512 \
93
- --train_batch_size 32 \
93
+ --train_batch_size 24 \
94
94
--eval_batch_size 64 \
95
95
--learning_rate 2e-5 \
96
96
--max_grad_norm 1.0 \
@@ -105,14 +105,14 @@ We use full valid data to evaluate. The inferencing cost is 1 min on 8*P100-16G.
105
105
```shell
106
106
cd code
107
107
CUDA_VISIBLE_DEVICES=6 python run.py \
108
- --output_dir=./adv_saved_models \
108
+ --output_dir=./saved_models \
109
109
--model_type=roberta \
110
110
--tokenizer_name=microsoft/codebert-base \
111
111
--model_name_or_path=microsoft/codebert-base \
112
112
--do_test \
113
113
--train_data_file=../preprocess/dataset/train.jsonl \
114
114
--eval_data_file=../preprocess/dataset/valid.jsonl \
115
- --test_data_file=../preprocess/dataset/test.jsonl \
115
+ --test_data_file=../preprocess/dataset/adv_test.jsonl \
116
116
--epoch 5 \
117
117
--block_size 512 \
118
118
--train_batch_size 32 \
@@ -126,11 +126,11 @@ CUDA_VISIBLE_DEVICES=6 python run.py \
126
126
## Attack
127
127
128
128
If you don't want to be bothered by fine-tuning models, you can download the victim model into `code/saved_models/checkpoint-best-acc` by [this link](https://drive.google.com/file/d/14STf95S3cDstI5CiyvK1giLlbDw4ZThu/view?usp=sharing).
129
-
129
+ ADV: https://drive.google.com/file/d/1CR3SWBlyMZLnctZklAHMFf0Jq1U7YdsZ/view?usp=sharing
130
130
```shell
131
131
pip install gdown
132
132
mkdir -p code/saved_models/checkpoint-best-acc
133
- gdown https://drive.google.com/uc?id=14STf95S3cDstI5CiyvK1giLlbDw4ZThu
133
+ gdown https://drive.google.com/uc?id=1CR3SWBlyMZLnctZklAHMFf0Jq1U7YdsZ
134
134
mv model.bin code/saved_models/checkpoint-best-acc/
135
135
```
136
136
@@ -227,19 +227,19 @@ python get_substitutes.py \
227
227
### Attack microsoft/codebert-base-mlm
228
228
```shell
229
229
cd code
230
- CUDA_VISIBLE_DEVICES=0 python gi_attack.py \
231
- --output_dir=./saved_models \
230
+ CUDA_VISIBLE_DEVICES=4 python gi_attack.py \
231
+ --output_dir=./adv_saved_models \
232
232
--model_type=roberta \
233
233
--tokenizer_name=microsoft/codebert-base-mlm \
234
234
--model_name_or_path=microsoft/codebert-base-mlm \
235
- --csv_store_path ./attack_no_gi.csv \
235
+ --csv_store_path ./attack_no_gitest_subs_400_800_.csv \
236
236
--base_model=microsoft/codebert-base-mlm \
237
237
--train_data_file=../preprocess/dataset/train_subs.jsonl \
238
- --eval_data_file=../preprocess/dataset/valid_subs.jsonl \
238
+ --eval_data_file=../preprocess/dataset/test_subs_400_800.jsonl \
239
239
--test_data_file=../preprocess/dataset/test_subs.jsonl \
240
240
--block_size 512 \
241
241
--eval_batch_size 64 \
242
- --seed 123456 2>&1 | tee attack_no_gi.log
242
+ --seed 123456 2>&1 | tee attack_no_gitest_subs_400_800_.log
243
243
```
244
244
245
245
# Genetic Programming
@@ -255,7 +255,7 @@ CUDA_VISIBLE_DEVICES=4 python gi_attack.py \
255
255
--base_model=microsoft/codebert-base-mlm \
256
256
--use_ga \
257
257
--train_data_file=../preprocess/dataset/train_subs.jsonl \
258
- --eval_data_file=../preprocess/dataset/valid_subs.jsonl \
258
+ --eval_data_file=../preprocess/dataset/test_subs_0_400.jsonl \
259
259
--test_data_file=../preprocess/dataset/test_subs.jsonl \
260
260
--block_size 512 \
261
261
--eval_batch_size 64 \
@@ -273,15 +273,15 @@ CUDA_VISIBLE_DEVICES=4 python gi_attack.py \
273
273
# MHM-Attack
274
274
```shell
275
275
cd code
276
- CUDA_VISIBLE_DEVICES=6 python mhm_attack.py \
276
+ CUDA_VISIBLE_DEVICES=2 python mhm_attack.py \
277
277
--output_dir=./saved_models \
278
278
--model_type=roberta \
279
279
--tokenizer_name=microsoft/codebert-base \
280
280
--model_name_or_path=microsoft/codebert-base \
281
281
--csv_store_path ./attack_mhm_ls.csv \
282
282
--base_model=microsoft/codebert-base-mlm \
283
283
--train_data_file=../preprocess/dataset/train_subs.jsonl \
284
- --eval_data_file=../preprocess/dataset/valid_subs.jsonl \
284
+ --eval_data_file=../preprocess/dataset/test_subs_0_400.jsonl \
285
285
--test_data_file=../preprocess/dataset/test_subs.jsonl \
286
286
--block_size 512 \
287
287
--eval_batch_size 64 \
0 commit comments