
Commit 2ddf836

Merge branch 'graphcodebert' of github.com:yangzhou6666/attack-pretrain-models-of-code into graphcodebert

2 parents: ec5c18a + f794903

32 files changed (+820, -698)

.gitignore (+2)

@@ -94,3 +94,5 @@ CodeXGLUE/Clone-detection-BigCloneBench/dataset/cached_test_sampled_3000_3500
 CodeXGLUE/Clone-detection-BigCloneBench/dataset/cached_test_sampled_3500_4000
 GraphCodeBERT/clonedetection/dataset/cached_test_sampled
 GraphCodeBERT/clonedetection/dataset/cached_train_sampled
+GraphCodeBERT/clonedetection/dataset/cached_valid_sampled
+CodeXGLUE/Defect-detection/code/adv_saved_models/checkpoint-best-acc/model.bin

CodeXGLUE/Authorship-Attribution/README.md (+4, -4)

@@ -52,18 +52,18 @@ We use full train data for fine-tuning. The training cost is 10 mins on 4*P100-16G.

 ```shell
 cd code
-CUDA_VISIBLE_DEVICES=0,2,4,5 python run.py \
+CUDA_VISIBLE_DEVICES=4,6 python run.py \
 --output_dir=./saved_models/gcjpy \
 --model_type=roberta \
 --config_name=microsoft/codebert-base \
 --model_name_or_path=microsoft/codebert-base \
 --tokenizer_name=roberta-base \
 --number_labels 66 \
---do_eval \
+--do_train \
 --train_data_file=../dataset/data_folder/processed_gcjpy/train.txt \
 --eval_data_file=../dataset/data_folder/processed_gcjpy/valid.txt \
 --test_data_file=../dataset/data_folder/processed_gcjpy/valid.txt \
---epoch 20 \
+--epoch 30 \
 --block_size 512 \
 --train_batch_size 16 \
 --eval_batch_size 32 \

@@ -124,7 +124,7 @@ python attack.py \
 #### MHM-LS
 ```shell
 cd code
-CUDA_VISIBLE_DEVICES=1 python mhm.py \
+CUDA_VISIBLE_DEVICES=6 python mhm.py \
 --output_dir=./saved_models/gcjpy \
 --model_type=roberta \
 --number_labels 66 \

CodeXGLUE/Authorship-Attribution/code/attack.py (+1, -1)

@@ -209,7 +209,7 @@ def main():
     with open(codes_file_path) as rf:
         for line in rf:
             item = json.loads(line.strip())
-            source_codes.append(item["code"])
+            source_codes.append(item["code"].replace("\\n", "\n").replace('\"','"'))
             substs.append(item["substitutes"])
     assert(len(source_codes) == len(eval_dataset) == len(substs))

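The appended `.replace(...)` chain (the same change lands in `mhm.py` and `run.py` below) undoes the double-escaping of code stored in the substitutes file. A minimal sketch with a hypothetical record, not taken from the actual dataset:

```python
# Hypothetical jsonl record: after json.loads, the "code" field still holds
# literal backslash-n sequences because it was escaped twice when written.
import json

line = '{"code": "def f():\\\\n    return 1", "substitutes": {}}'
item = json.loads(line.strip())
print(repr(item["code"]))                 # 'def f():\\n    return 1'
code = item["code"].replace("\\n", "\n")  # restore real newlines
print(code)
# Note: '\"' is the same Python string as '"', so the second replace in the
# diff is effectively a no-op; json.loads has already unescaped the quotes.
```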

CodeXGLUE/Authorship-Attribution/code/attacker.py (+73, -77)

Large diffs are not rendered by default.
[file header not rendered in this view] (new file, +24)

@@ -0,0 +1,24 @@
+import csv
+import json
+import random
+
+csv.field_size_limit(100000000)
+
+def main():
+    total_count = 0
+    greedy_succ = 0
+
+    with open("./attack_gi.csv") as rf:
+        reader = csv.DictReader(rf)
+        for row in reader:
+            if not row["Is Success"] == "-4":
+                total_count += 1
+                if row["Is Success"] == "1":
+                    greedy_succ += 1
+    print(greedy_succ)
+    print(total_count)
+    print(float(int(greedy_succ))/int(total_count))
+
+if __name__ == "__main__":
+    main()
+
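This new helper tallies the greedy attack's success rate from `attack_gi.csv`. A sketch of the row shape it assumes, where "1" marks a successful attack and "-4" presumably marks skipped examples (column semantics inferred, not confirmed by this diff):

```python
# Toy rows (hypothetical) in the CSV format the tally script expects.
rows = [{"Index": "0", "Is Success": "1"},
        {"Index": "1", "Is Success": "-4"},  # skipped: excluded from the total
        {"Index": "2", "Is Success": "0"}]

attempted = [r for r in rows if r["Is Success"] != "-4"]
succeeded = [r for r in rows if r["Is Success"] == "1"]
print(len(succeeded) / len(attempted))       # 0.5, the reported success rate
```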

CodeXGLUE/Authorship-Attribution/code/mhm.py (+3, -9)

@@ -165,7 +165,7 @@ def main():
     with open(codes_file_path) as rf:
         for line in rf:
             item = json.loads(line.strip())
-            source_codes.append(item["code"])
+            source_codes.append(item["code"].replace("\\n", "\n").replace('\"','"'))
             substs.append(item["substitutes"])
     assert(len(source_codes) == len(eval_dataset) == len(substs))

@@ -192,14 +192,8 @@ def main():
     for index, example in enumerate(eval_dataset):
         code = source_codes[index]
         subs = substs[index]
-        identifiers, code_tokens = get_identifiers(code, lang='python')
-        code_tokens = [i for i in code_tokens]
-        processed_code = " ".join(code_tokens)

-        new_feature = convert_code_to_features(processed_code, tokenizer, example[1].item(), args)
-        new_dataset = CodeDataset([new_feature])
-
-        orig_prob, orig_label = model.get_results(new_dataset, args.eval_batch_size)
+        orig_prob, orig_label = model.get_results([example], args.eval_batch_size)
         orig_prob = orig_prob[0]
         orig_label = orig_label[0]
         ground_truth = example[1].item()

@@ -234,7 +228,7 @@
         print (" curr succ rate = "+str(n_succ/total_cnt))
         print("Query times in this attack: ", model.query - query_times)
         print("All Query times: ", model.query)
-        recoder.writemhm(index, code, _res["prog_length"], " ".join(_res['tokens']), ground_truth, orig_label, _res["new_pred"], _res["is_success"], _res["old_uid"], _res["score_info"], _res["nb_changed_var"], _res["nb_changed_pos"], _res["replace_info"], _res["attack_type"], model.query - query_times, time_cost)
+        recoder.writemhm(index, code, _res["prog_length"], _res['tokens'], ground_truth, orig_label, _res["new_pred"], _res["is_success"], _res["old_uid"], _res["score_info"], _res["nb_changed_var"], _res["nb_changed_pos"], _res["replace_info"], _res["attack_type"], model.query - query_times, time_cost)
         query_times = model.query

 if __name__ == "__main__":

CodeXGLUE/Authorship-Attribution/code/run.py (+2)

@@ -104,6 +104,7 @@ def __init__(self, tokenizer, args, file_path=None):
            self.examples = torch.load(cache_file_path)
            with open(code_pairs_file_path, 'rb') as f:
                code_files = pickle.load(f)
+
            logger.info("Loading features from cached file %s", cache_file_path)

        except:

@@ -112,6 +113,7 @@ def __init__(self, tokenizer, args, file_path=None):
            with open(file_path) as f:
                for line in f:
                    code = line.split(" <CODESPLIT> ")[0]
+                    code = code.replace("\\n", "\n").replace('\"','"')
                    label = line.split(" <CODESPLIT> ")[1]
                    # Convert these two pieces into model inputs.
                    self.examples.append(convert_examples_to_features(code, int(label), tokenizer,args))
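The loader splits each dataset line on ` <CODESPLIT> ` and now unescapes the code half before featurizing. A small sketch with a hypothetical line (format inferred from the split calls):

```python
# Hypothetical record: escaped source code, separator, then the author label.
line = 'def f():\\n    return 1 <CODESPLIT> 42'

code = line.split(" <CODESPLIT> ")[0]
code = code.replace("\\n", "\n").replace('\"', '"')  # same unescaping as above
label = int(line.split(" <CODESPLIT> ")[1])

print(code)   # prints two real lines of code
print(label)  # 42
```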

CodeXGLUE/Clone-detection-BigCloneBench/code/attacker.py (-1)

@@ -378,7 +378,6 @@ def greedy_attack(self, example, substitutes, code):
        sorted_list_of_names = sorted(names_to_importance_score.items(), key=lambda x: x[1], reverse=True)
        # Sort by importance_score

-       final_words = copy.deepcopy(words)
        final_code = copy.deepcopy(code_1)

        nb_changed_var = 0  # number of modified variables
[file header not rendered in this view]

@@ -1,4 +1,4 @@
-CUDA_VISIBLE_DEVICES=0 python attack.py \
+CUDA_VISIBLE_DEVICES=1 python attack.py \
 --output_dir=./saved_models \
 --model_type=roberta \
 --config_name=microsoft/codebert-base \

@@ -11,10 +11,10 @@ CUDA_VISIBLE_DEVICES=0 python attack.py \
 --eval_data_file=../dataset/test_sampled_1500_2000.txt \
 --test_data_file=../dataset/test_sampled.txt \
 --block_size 512 \
---eval_batch_size 6 \
+--eval_batch_size 16 \
 --seed 123456 2>&1| tee attack_GA_1500_2000.log &

-CUDA_VISIBLE_DEVICES=1 python attack.py \
+CUDA_VISIBLE_DEVICES=7 python attack.py \
 --output_dir=./saved_models \
 --model_type=roberta \
 --config_name=microsoft/codebert-base \

@@ -29,51 +29,3 @@ CUDA_VISIBLE_DEVICES=1 python attack.py \
 --block_size 512 \
 --eval_batch_size 16 \
 --seed 123456 2>&1| tee attack_GA_2000_2500.log &
-
-CUDA_VISIBLE_DEVICES=1 python attack.py \
---output_dir=./saved_models \
---model_type=roberta \
---config_name=microsoft/codebert-base \
---csv_store_path ./attack_GA_2500_3000.csv \
---model_name_or_path=microsoft/codebert-base \
---tokenizer_name=roberta-base \
---use_ga \
---base_model=microsoft/codebert-base-mlm \
---train_data_file=../dataset/train_sampled.txt \
---eval_data_file=../dataset/test_sampled_2500_3000.txt \
---test_data_file=../dataset/test_sampled.txt \
---block_size 512 \
---eval_batch_size 16 \
---seed 123456 2>&1| tee attack_GA_2500_3000.log &
-
-CUDA_VISIBLE_DEVICES=0 python attack.py \
---output_dir=./saved_models \
---model_type=roberta \
---config_name=microsoft/codebert-base \
---csv_store_path ./attack_GA_3000_3500.csv \
---model_name_or_path=microsoft/codebert-base \
---tokenizer_name=roberta-base \
---use_ga \
---base_model=microsoft/codebert-base-mlm \
---train_data_file=../dataset/train_sampled.txt \
---eval_data_file=../dataset/test_sampled_3000_3500.txt \
---test_data_file=../dataset/test_sampled.txt \
---block_size 512 \
---eval_batch_size 16 \
---seed 123456 2>&1| tee attack_GA_3000_3500.log &
-
-CUDA_VISIBLE_DEVICES=1 python attack.py \
---output_dir=./saved_models \
---model_type=roberta \
---config_name=microsoft/codebert-base \
---csv_store_path ./attack_GA_3500_4000.csv \
---model_name_or_path=microsoft/codebert-base \
---tokenizer_name=roberta-base \
---use_ga \
---base_model=microsoft/codebert-base-mlm \
---train_data_file=../dataset/train_sampled.txt \
---eval_data_file=../dataset/test_sampled_3500_4000.txt \
---test_data_file=../dataset/test_sampled.txt \
---block_size 512 \
---eval_batch_size 16 \
---seed 123456 2>&1| tee attack_GA_3500_4000.log &
[file header not rendered in this view] (new file, +59)

@@ -0,0 +1,59 @@
+import pandas as pd
+from random import sample
+
+
+def random_selection():
+    fields = ['Index', 'Is Success']
+    # read specific columns
+    mhm_path = './results/attack_mhm.csv'
+    gi_path = './results/attack_genetic.csv'
+    index_mhm = pd.read_csv(mhm_path, skipinitialspace=True, usecols=fields)
+    index_gi = pd.read_csv(gi_path, skipinitialspace=True, usecols=fields)
+    mhm_success = index_mhm[index_mhm['Is Success'] == 1]
+    gi_success = index_gi[index_gi['Is Success'] == 1]
+    print(type(gi_success))
+    intersect = list(set(mhm_success['Index'].values.tolist()).intersection(set(gi_success['Index'].values.tolist())))
+    print(len(intersect))
+    # samples = sample(intersect, 100)
+    #
+    # print(samples)
+    # print(len(set(samples)))
+    # return samples
+
+    return intersect
+
+def filter_csv(index):
+    mhm_path = './results/attack_mhm.csv'
+    gi_path = './results/attack_genetic.csv'
+    index_mhm = pd.read_csv(mhm_path)
+    index_gi = pd.read_csv(gi_path)
+
+    mhm = index_mhm.loc[index_mhm['Index'].isin(index)]
+    gi = index_gi.loc[index_gi['Index'].isin(index)]
+
+    data = [gi["Index"], gi["Original Code"], gi["Adversarial Code"], gi["Extracted Names"], gi["Replaced Names"],
+            mhm["Adversarial Code"], mhm["Extracted Names"], mhm["Replaced Names"],]
+
+    headers = ["Index", "Original", "GA_Adversarial Code", "GA_Extracted Names", "GA_Replaced Names",
+               "mhm_Adversarial Code", "mhm_Extracted Names", "mhm_Replaced Names",]
+    gi.to_csv('gi.csv', index=False)
+    mhm.to_csv('mhm.csv', index=False)
+
+    print(mhm)
+    df3 = pd.concat(data, axis=1, keys=headers)
+    df3.to_csv('total.csv', index=False)
+
+    print(df3)
+
+def write_attack_files(index):
+    f_original = open("original.txt", "w")
+    f_mhm = open("mhm_attack.txt", "w")
+    f_ga = open("ga_attack.txt", "w")
+
+
+def main():
+    indexes = random_selection()
+    filter_csv(indexes)
+
+if __name__ == '__main__':
+    main()
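One detail worth noting in `filter_csv` above: passing `keys` to `pd.concat` along `axis=1` is what renames the selected Series into the `headers` columns of `total.csv`. A standalone sketch of that idiom:

```python
import pandas as pd

a = pd.Series([10, 11], name="Index")
b = pd.Series(["x = 1", "y = 2"], name="Adversarial Code")

# keys= overrides the Series names as the output column labels.
df = pd.concat([a, b], axis=1, keys=["Index", "GA_Adversarial Code"])
print(df.columns.tolist())  # ['Index', 'GA_Adversarial Code']
```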

CodeXGLUE/Defect-detection/README.md (+16, -16)

@@ -79,18 +79,18 @@ We use full train data for fine-tuning. The training cost is 50 mins on 8*P100-16G.

 ```shell
 cd code
-python run.py \
---output_dir=./saved_models \
+CUDA_VISIBLE_DEVICES=4,6 python run.py \
+--output_dir=./adv_saved_models \
 --model_type=roberta \
 --tokenizer_name=microsoft/codebert-base \
 --model_name_or_path=microsoft/codebert-base \
 --do_train \
---train_data_file=../preprocess/dataset/train.jsonl \
+--train_data_file=../preprocess/dataset/adv_train.jsonl \
 --eval_data_file=../preprocess/dataset/valid.jsonl \
 --test_data_file=../preprocess/dataset/test.jsonl \
 --epoch 5 \
 --block_size 512 \
---train_batch_size 32 \
+--train_batch_size 24 \
 --eval_batch_size 64 \
 --learning_rate 2e-5 \
 --max_grad_norm 1.0 \

@@ -105,14 +105,14 @@ We use full valid data to evaluate. The inferencing cost is 1 min on 8*P100-16G.
 ```shell
 cd code
 CUDA_VISIBLE_DEVICES=6 python run.py \
---output_dir=./adv_saved_models \
+--output_dir=./saved_models \
 --model_type=roberta \
 --tokenizer_name=microsoft/codebert-base \
 --model_name_or_path=microsoft/codebert-base \
 --do_test \
 --train_data_file=../preprocess/dataset/train.jsonl \
 --eval_data_file=../preprocess/dataset/valid.jsonl \
---test_data_file=../preprocess/dataset/test.jsonl \
+--test_data_file=../preprocess/dataset/adv_test.jsonl \
 --epoch 5 \
 --block_size 512 \
 --train_batch_size 32 \

@@ -126,11 +126,11 @@ CUDA_VISIBLE_DEVICES=6 python run.py \
 ## Attack

 If you don't want to be bothered by fine-tuning models, you can download the victim model into `code/saved_models/checkpoint-best-acc` by [this link](https://drive.google.com/file/d/14STf95S3cDstI5CiyvK1giLlbDw4ZThu/view?usp=sharing).
-
+ADV: https://drive.google.com/file/d/1CR3SWBlyMZLnctZklAHMFf0Jq1U7YdsZ/view?usp=sharing
 ```shell
 pip install gdown
 mkdir -p code/saved_models/checkpoint-best-acc
-gdown https://drive.google.com/uc?id=14STf95S3cDstI5CiyvK1giLlbDw4ZThu
+gdown https://drive.google.com/uc?id=1CR3SWBlyMZLnctZklAHMFf0Jq1U7YdsZ
 mv model.bin code/saved_models/checkpoint-best-acc/
 ```

@@ -227,19 +227,19 @@ python get_substitutes.py \
 ### Attack microsoft/codebert-base-mlm
 ```shell
 cd code
-CUDA_VISIBLE_DEVICES=0 python gi_attack.py \
---output_dir=./saved_models \
+CUDA_VISIBLE_DEVICES=4 python gi_attack.py \
+--output_dir=./adv_saved_models \
 --model_type=roberta \
 --tokenizer_name=microsoft/codebert-base-mlm \
 --model_name_or_path=microsoft/codebert-base-mlm \
---csv_store_path ./attack_no_gi.csv \
+--csv_store_path ./attack_no_gitest_subs_400_800_.csv \
 --base_model=microsoft/codebert-base-mlm \
 --train_data_file=../preprocess/dataset/train_subs.jsonl \
---eval_data_file=../preprocess/dataset/valid_subs.jsonl \
+--eval_data_file=../preprocess/dataset/test_subs_400_800.jsonl \
 --test_data_file=../preprocess/dataset/test_subs.jsonl \
 --block_size 512 \
 --eval_batch_size 64 \
---seed 123456 2>&1 | tee attack_no_gi.log
+--seed 123456 2>&1 | tee attack_no_gitest_subs_400_800_.log
 ```

 # Genetic Programming

@@ -255,7 +255,7 @@ CUDA_VISIBLE_DEVICES=4 python gi_attack.py \
 --base_model=microsoft/codebert-base-mlm \
 --use_ga \
 --train_data_file=../preprocess/dataset/train_subs.jsonl \
---eval_data_file=../preprocess/dataset/valid_subs.jsonl \
+--eval_data_file=../preprocess/dataset/test_subs_0_400.jsonl \
 --test_data_file=../preprocess/dataset/test_subs.jsonl \
 --block_size 512 \
 --eval_batch_size 64 \

@@ -273,15 +273,15 @@ CUDA_VISIBLE_DEVICES=4 python gi_attack.py \
 # MHM-Attack
 ```shell
 cd code
-CUDA_VISIBLE_DEVICES=6 python mhm_attack.py \
+CUDA_VISIBLE_DEVICES=2 python mhm_attack.py \
 --output_dir=./saved_models \
 --model_type=roberta \
 --tokenizer_name=microsoft/codebert-base \
 --model_name_or_path=microsoft/codebert-base \
 --csv_store_path ./attack_mhm_ls.csv \
 --base_model=microsoft/codebert-base-mlm \
 --train_data_file=../preprocess/dataset/train_subs.jsonl \
---eval_data_file=../preprocess/dataset/valid_subs.jsonl \
+--eval_data_file=../preprocess/dataset/test_subs_0_400.jsonl \
 --test_data_file=../preprocess/dataset/test_subs.jsonl \
 --block_size 512 \
 --eval_batch_size 64 \

CodeXGLUE/Defect-detection/code/run.py (+1, -1)

@@ -92,7 +92,7 @@ def convert_examples_to_features(js,tokenizer,args):
     source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
     padding_length = args.block_size - len(source_ids)
     source_ids+=[tokenizer.pad_token_id]*padding_length
-    return InputFeatures(source_tokens,source_ids,js['idx'],js['target'])
+    return InputFeatures(source_tokens,source_ids,js['idx'],int(js['target']))

 class TextDataset(Dataset):
     def __init__(self, tokenizer, args, file_path=None):
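The added `int(...)` cast guards against label type drift: regenerated jsonl files (such as the adversarial sets built below) may carry `target` as a string, while `InputFeatures` is expected to hold a plain int. A minimal sketch with hypothetical rows:

```python
import json

for line in ['{"idx": 0, "target": 1}', '{"idx": 1, "target": "1"}']:
    js = json.loads(line)
    label = int(js["target"])  # 1 in both cases; without the cast, "1" != 1
    print(label == 1)          # True, True
```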
[file header not rendered in this view] (new file, +22)

@@ -0,0 +1,22 @@
+import csv
+import json
+import random
+from tqdm import tqdm
+csv.field_size_limit(100000000)
+adv_data = []
+for index in [(0,400), (400,800), (800,1200), (1200,1600), (1600,2000), (2000,2400), (2400,2800)]:
+    with open("../code/attack_genetic_test_subs_"+str(index[0])+"_"+str(index[1])+".csv") as rf:
+        reader = csv.DictReader(rf)
+        for row in reader:
+            if not len(row["Adversarial Code"]) == 0:
+                adv_data.append({"target":int(row["True Label"]), "func":row["Adversarial Code"], "idx":None})
+print(len(adv_data))
+# with open("./dataset/train.jsonl") as rf:
+#     for line in rf:
+#         adv_data.append(json.loads(line.strip()))
+# print(len(adv_data))
+random.shuffle(adv_data)
+
+with open("./dataset/adv_test.jsonl", "w") as wf:
+    for item in tqdm(adv_data):
+        wf.write(json.dumps(item)+'\n')
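A quick sanity check (assumed file location and shapes) that the regenerated file round-trips into the fields `run.py`'s `convert_examples_to_features` reads, namely `func`, `target`, and `idx`:

```python
import json

with open("./dataset/adv_test.jsonl") as rf:
    for line in rf:
        item = json.loads(line.strip())
        assert set(item) == {"target", "func", "idx"}
        assert isinstance(item["target"], int)  # written as int(row["True Label"])
```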
