
Commit 2ddf836

Merge branch 'graphcodebert' of github.com:yangzhou6666/attack-pretrain-models-of-code into graphcodebert

2 parents: ec5c18a + f794903

32 files changed (+820, -698)

.gitignore (+2)

@@ -94,3 +94,5 @@ CodeXGLUE/Clone-detection-BigCloneBench/dataset/cached_test_sampled_3000_3500
 CodeXGLUE/Clone-detection-BigCloneBench/dataset/cached_test_sampled_3500_4000
 GraphCodeBERT/clonedetection/dataset/cached_test_sampled
 GraphCodeBERT/clonedetection/dataset/cached_train_sampled
+GraphCodeBERT/clonedetection/dataset/cached_valid_sampled
+CodeXGLUE/Defect-detection/code/adv_saved_models/checkpoint-best-acc/model.bin

CodeXGLUE/Authorship-Attribution/README.md (+4, -4)

@@ -52,18 +52,18 @@ We use full train data for fine-tuning. The training cost is 10 mins on 4*P100-16G.

 ```shell
 cd code
-CUDA_VISIBLE_DEVICES=0,2,4,5 python run.py \
+CUDA_VISIBLE_DEVICES=4,6 python run.py \
 --output_dir=./saved_models/gcjpy \
 --model_type=roberta \
 --config_name=microsoft/codebert-base \
 --model_name_or_path=microsoft/codebert-base \
 --tokenizer_name=roberta-base \
 --number_labels 66 \
---do_eval \
+--do_train \
 --train_data_file=../dataset/data_folder/processed_gcjpy/train.txt \
 --eval_data_file=../dataset/data_folder/processed_gcjpy/valid.txt \
 --test_data_file=../dataset/data_folder/processed_gcjpy/valid.txt \
---epoch 20 \
+--epoch 30 \
 --block_size 512 \
 --train_batch_size 16 \
 --eval_batch_size 32 \

@@ -124,7 +124,7 @@ python attack.py \
 #### MHM-LS
 ```shell
 cd code
-CUDA_VISIBLE_DEVICES=1 python mhm.py \
+CUDA_VISIBLE_DEVICES=6 python mhm.py \
 --output_dir=./saved_models/gcjpy \
 --model_type=roberta \
 --number_labels 66 \

CodeXGLUE/Authorship-Attribution/code/attack.py (+1, -1)

@@ -209,7 +209,7 @@ def main():
     with open(codes_file_path) as rf:
         for line in rf:
             item = json.loads(line.strip())
-            source_codes.append(item["code"])
+            source_codes.append(item["code"].replace("\\n", "\n").replace('\"','"'))
             substs.append(item["substitutes"])
     assert(len(source_codes) == len(eval_dataset) == len(substs))

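The appended `.replace(...)` chain (the same change lands in `mhm.py` and `run.py` below) undoes the double-escaping of code stored in the substitutes file. A minimal sketch with a hypothetical record, not taken from the actual dataset:

```python
# Hypothetical jsonl record: after json.loads, the "code" field still holds
# literal backslash-n sequences because it was escaped twice when written.
import json

line = '{"code": "def f():\\\\n    return 1", "substitutes": {}}'
item = json.loads(line.strip())
print(repr(item["code"]))                 # 'def f():\\n    return 1'
code = item["code"].replace("\\n", "\n")  # restore real newlines
print(code)
# Note: '\"' is the same Python string as '"', so the second replace in the
# diff is effectively a no-op; json.loads has already unescaped the quotes.
```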

CodeXGLUE/Authorship-Attribution/code/attacker.py (+73, -77)

Large diffs are not rendered by default.
[file header not rendered in this view] (new file, +24)

@@ -0,0 +1,24 @@
+import csv
+import json
+import random
+
+csv.field_size_limit(100000000)
+
+def main():
+    total_count = 0
+    greedy_succ = 0
+
+    with open("./attack_gi.csv") as rf:
+        reader = csv.DictReader(rf)
+        for row in reader:
+            if not row["Is Success"] == "-4":
+                total_count += 1
+                if row["Is Success"] == "1":
+                    greedy_succ += 1
+    print(greedy_succ)
+    print(total_count)
+    print(float(int(greedy_succ))/int(total_count))
+
+if __name__ == "__main__":
+    main()
+
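This new helper tallies the greedy attack's success rate from `attack_gi.csv`. A sketch of the row shape it assumes, where "1" marks a successful attack and "-4" presumably marks skipped examples (column semantics inferred, not confirmed by this diff):

```python
# Toy rows (hypothetical) in the CSV format the tally script expects.
rows = [{"Index": "0", "Is Success": "1"},
        {"Index": "1", "Is Success": "-4"},  # skipped: excluded from the total
        {"Index": "2", "Is Success": "0"}]

attempted = [r for r in rows if r["Is Success"] != "-4"]
succeeded = [r for r in rows if r["Is Success"] == "1"]
print(len(succeeded) / len(attempted))       # 0.5, the reported success rate
```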

CodeXGLUE/Authorship-Attribution/code/mhm.py (+3, -9)

@@ -165,7 +165,7 @@ def main():
     with open(codes_file_path) as rf:
         for line in rf:
             item = json.loads(line.strip())
-            source_codes.append(item["code"])
+            source_codes.append(item["code"].replace("\\n", "\n").replace('\"','"'))
             substs.append(item["substitutes"])
     assert(len(source_codes) == len(eval_dataset) == len(substs))

@@ -192,14 +192,8 @@ def main():
     for index, example in enumerate(eval_dataset):
         code = source_codes[index]
         subs = substs[index]
-        identifiers, code_tokens = get_identifiers(code, lang='python')
-        code_tokens = [i for i in code_tokens]
-        processed_code = " ".join(code_tokens)

-        new_feature = convert_code_to_features(processed_code, tokenizer, example[1].item(), args)
-        new_dataset = CodeDataset([new_feature])
-
-        orig_prob, orig_label = model.get_results(new_dataset, args.eval_batch_size)
+        orig_prob, orig_label = model.get_results([example], args.eval_batch_size)
         orig_prob = orig_prob[0]
         orig_label = orig_label[0]
         ground_truth = example[1].item()

@@ -234,7 +228,7 @@
         print (" curr succ rate = "+str(n_succ/total_cnt))
         print("Query times in this attack: ", model.query - query_times)
         print("All Query times: ", model.query)
-        recoder.writemhm(index, code, _res["prog_length"], " ".join(_res['tokens']), ground_truth, orig_label, _res["new_pred"], _res["is_success"], _res["old_uid"], _res["score_info"], _res["nb_changed_var"], _res["nb_changed_pos"], _res["replace_info"], _res["attack_type"], model.query - query_times, time_cost)
+        recoder.writemhm(index, code, _res["prog_length"], _res['tokens'], ground_truth, orig_label, _res["new_pred"], _res["is_success"], _res["old_uid"], _res["score_info"], _res["nb_changed_var"], _res["nb_changed_pos"], _res["replace_info"], _res["attack_type"], model.query - query_times, time_cost)
         query_times = model.query

 if __name__ == "__main__":

CodeXGLUE/Authorship-Attribution/code/run.py (+2)

@@ -104,6 +104,7 @@ def __init__(self, tokenizer, args, file_path=None):
            self.examples = torch.load(cache_file_path)
            with open(code_pairs_file_path, 'rb') as f:
                code_files = pickle.load(f)
+
            logger.info("Loading features from cached file %s", cache_file_path)

        except:

@@ -112,6 +113,7 @@ def __init__(self, tokenizer, args, file_path=None):
            with open(file_path) as f:
                for line in f:
                    code = line.split(" <CODESPLIT> ")[0]
+                    code = code.replace("\\n", "\n").replace('\"','"')
                    label = line.split(" <CODESPLIT> ")[1]
                    # Convert these two pieces into model inputs.
                    self.examples.append(convert_examples_to_features(code, int(label), tokenizer,args))
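The loader splits each dataset line on ` <CODESPLIT> ` and now unescapes the code half before featurizing. A small sketch with a hypothetical line (format inferred from the split calls):

```python
# Hypothetical record: escaped source code, separator, then the author label.
line = 'def f():\\n    return 1 <CODESPLIT> 42'

code = line.split(" <CODESPLIT> ")[0]
code = code.replace("\\n", "\n").replace('\"', '"')  # same unescaping as above
label = int(line.split(" <CODESPLIT> ")[1])

print(code)   # prints two real lines of code
print(label)  # 42
```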

CodeXGLUE/Clone-detection-BigCloneBench/code/attacker.py (-1)

@@ -378,7 +378,6 @@ def greedy_attack(self, example, substitutes, code):
        sorted_list_of_names = sorted(names_to_importance_score.items(), key=lambda x: x[1], reverse=True)
        # Sort by importance_score

-       final_words = copy.deepcopy(words)
        final_code = copy.deepcopy(code_1)

        nb_changed_var = 0  # number of modified variables
[file header not rendered in this view]

@@ -1,4 +1,4 @@
-CUDA_VISIBLE_DEVICES=0 python attack.py \
+CUDA_VISIBLE_DEVICES=1 python attack.py \
 --output_dir=./saved_models \
 --model_type=roberta \
 --config_name=microsoft/codebert-base \

@@ -11,10 +11,10 @@ CUDA_VISIBLE_DEVICES=0 python attack.py \
 --eval_data_file=../dataset/test_sampled_1500_2000.txt \
 --test_data_file=../dataset/test_sampled.txt \
 --block_size 512 \
---eval_batch_size 6 \
+--eval_batch_size 16 \
 --seed 123456 2>&1| tee attack_GA_1500_2000.log &

-CUDA_VISIBLE_DEVICES=1 python attack.py \
+CUDA_VISIBLE_DEVICES=7 python attack.py \
 --output_dir=./saved_models \
 --model_type=roberta \
 --config_name=microsoft/codebert-base \

@@ -29,51 +29,3 @@ CUDA_VISIBLE_DEVICES=1 python attack.py \
 --block_size 512 \
 --eval_batch_size 16 \
 --seed 123456 2>&1| tee attack_GA_2000_2500.log &
-
-CUDA_VISIBLE_DEVICES=1 python attack.py \
---output_dir=./saved_models \
---model_type=roberta \
---config_name=microsoft/codebert-base \
---csv_store_path ./attack_GA_2500_3000.csv \
---model_name_or_path=microsoft/codebert-base \
---tokenizer_name=roberta-base \
---use_ga \
---base_model=microsoft/codebert-base-mlm \
---train_data_file=../dataset/train_sampled.txt \
---eval_data_file=../dataset/test_sampled_2500_3000.txt \
---test_data_file=../dataset/test_sampled.txt \
---block_size 512 \
---eval_batch_size 16 \
---seed 123456 2>&1| tee attack_GA_2500_3000.log &
-
-CUDA_VISIBLE_DEVICES=0 python attack.py \
---output_dir=./saved_models \
---model_type=roberta \
---config_name=microsoft/codebert-base \
---csv_store_path ./attack_GA_3000_3500.csv \
---model_name_or_path=microsoft/codebert-base \
---tokenizer_name=roberta-base \
---use_ga \
---base_model=microsoft/codebert-base-mlm \
---train_data_file=../dataset/train_sampled.txt \
---eval_data_file=../dataset/test_sampled_3000_3500.txt \
---test_data_file=../dataset/test_sampled.txt \
---block_size 512 \
---eval_batch_size 16 \
---seed 123456 2>&1| tee attack_GA_3000_3500.log &
-
-CUDA_VISIBLE_DEVICES=1 python attack.py \
---output_dir=./saved_models \
---model_type=roberta \
---config_name=microsoft/codebert-base \
---csv_store_path ./attack_GA_3500_4000.csv \
---model_name_or_path=microsoft/codebert-base \
---tokenizer_name=roberta-base \
---use_ga \
---base_model=microsoft/codebert-base-mlm \
---train_data_file=../dataset/train_sampled.txt \
---eval_data_file=../dataset/test_sampled_3500_4000.txt \
---test_data_file=../dataset/test_sampled.txt \
---block_size 512 \
---eval_batch_size 16 \
---seed 123456 2>&1| tee attack_GA_3500_4000.log &
[file header not rendered in this view] (new file, +59)

@@ -0,0 +1,59 @@
+import pandas as pd
+from random import sample
+
+
+def random_selection():
+    fields = ['Index', 'Is Success']
+    # read specific columns
+    mhm_path = './results/attack_mhm.csv'
+    gi_path = './results/attack_genetic.csv'
+    index_mhm = pd.read_csv(mhm_path, skipinitialspace=True, usecols=fields)
+    index_gi = pd.read_csv(gi_path, skipinitialspace=True, usecols=fields)
+    mhm_success = index_mhm[index_mhm['Is Success'] == 1]
+    gi_success = index_gi[index_gi['Is Success'] == 1]
+    print(type(gi_success))
+    intersect = list(set(mhm_success['Index'].values.tolist()).intersection(set(gi_success['Index'].values.tolist())))
+    print(len(intersect))
+    # samples = sample(intersect, 100)
+    #
+    # print(samples)
+    # print(len(set(samples)))
+    # return samples
+
+    return intersect
+
+def filter_csv(index):
+    mhm_path = './results/attack_mhm.csv'
+    gi_path = './results/attack_genetic.csv'
+    index_mhm = pd.read_csv(mhm_path)
+    index_gi = pd.read_csv(gi_path)
+
+    mhm = index_mhm.loc[index_mhm['Index'].isin(index)]
+    gi = index_gi.loc[index_gi['Index'].isin(index)]
+
+    data = [gi["Index"], gi["Original Code"], gi["Adversarial Code"], gi["Extracted Names"], gi["Replaced Names"],
+            mhm["Adversarial Code"], mhm["Extracted Names"], mhm["Replaced Names"],]
+
+    headers = ["Index", "Original", "GA_Adversarial Code", "GA_Extracted Names", "GA_Replaced Names",
+               "mhm_Adversarial Code", "mhm_Extracted Names", "mhm_Replaced Names",]
+    gi.to_csv('gi.csv', index=False)
+    mhm.to_csv('mhm.csv', index=False)
+
+    print(mhm)
+    df3 = pd.concat(data, axis=1, keys=headers)
+    df3.to_csv('total.csv', index=False)
+
+    print(df3)
+
+def write_attack_files(index):
+    f_original = open("original.txt", "w")
+    f_mhm = open("mhm_attack.txt", "w")
+    f_ga = open("ga_attack.txt", "w")
+
+
+def main():
+    indexes = random_selection()
+    filter_csv(indexes)
+
+if __name__ == '__main__':
+    main()
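One detail worth noting in `filter_csv` above: passing `keys` to `pd.concat` along `axis=1` is what renames the selected Series into the `headers` columns of `total.csv`. A standalone sketch of that idiom:

```python
import pandas as pd

a = pd.Series([10, 11], name="Index")
b = pd.Series(["x = 1", "y = 2"], name="Adversarial Code")

# keys= overrides the Series names as the output column labels.
df = pd.concat([a, b], axis=1, keys=["Index", "GA_Adversarial Code"])
print(df.columns.tolist())  # ['Index', 'GA_Adversarial Code']
```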

CodeXGLUE/Defect-detection/README.md (+16, -16)

@@ -79,18 +79,18 @@ We use full train data for fine-tuning. The training cost is 50 mins on 8*P100-16G.

 ```shell
 cd code
-python run.py \
---output_dir=./saved_models \
+CUDA_VISIBLE_DEVICES=4,6 python run.py \
+--output_dir=./adv_saved_models \
 --model_type=roberta \
 --tokenizer_name=microsoft/codebert-base \
 --model_name_or_path=microsoft/codebert-base \
 --do_train \
---train_data_file=../preprocess/dataset/train.jsonl \
+--train_data_file=../preprocess/dataset/adv_train.jsonl \
 --eval_data_file=../preprocess/dataset/valid.jsonl \
 --test_data_file=../preprocess/dataset/test.jsonl \
 --epoch 5 \
 --block_size 512 \
---train_batch_size 32 \
+--train_batch_size 24 \
 --eval_batch_size 64 \
 --learning_rate 2e-5 \
 --max_grad_norm 1.0 \

@@ -105,14 +105,14 @@ We use full valid data to evaluate. The inferencing cost is 1 min on 8*P100-16G.
 ```shell
 cd code
 CUDA_VISIBLE_DEVICES=6 python run.py \
---output_dir=./adv_saved_models \
+--output_dir=./saved_models \
 --model_type=roberta \
 --tokenizer_name=microsoft/codebert-base \
 --model_name_or_path=microsoft/codebert-base \
 --do_test \
 --train_data_file=../preprocess/dataset/train.jsonl \
 --eval_data_file=../preprocess/dataset/valid.jsonl \
---test_data_file=../preprocess/dataset/test.jsonl \
+--test_data_file=../preprocess/dataset/adv_test.jsonl \
 --epoch 5 \
 --block_size 512 \
 --train_batch_size 32 \

@@ -126,11 +126,11 @@ CUDA_VISIBLE_DEVICES=6 python run.py \
 ## Attack

 If you don't want to be bothered by fine-tuning models, you can download the victim model into `code/saved_models/checkpoint-best-acc` by [this link](https://drive.google.com/file/d/14STf95S3cDstI5CiyvK1giLlbDw4ZThu/view?usp=sharing).
-
+ADV: https://drive.google.com/file/d/1CR3SWBlyMZLnctZklAHMFf0Jq1U7YdsZ/view?usp=sharing
 ```shell
 pip install gdown
 mkdir -p code/saved_models/checkpoint-best-acc
-gdown https://drive.google.com/uc?id=14STf95S3cDstI5CiyvK1giLlbDw4ZThu
+gdown https://drive.google.com/uc?id=1CR3SWBlyMZLnctZklAHMFf0Jq1U7YdsZ
 mv model.bin code/saved_models/checkpoint-best-acc/
 ```

@@ -227,19 +227,19 @@ python get_substitutes.py \
 ### Attack microsoft/codebert-base-mlm
 ```shell
 cd code
-CUDA_VISIBLE_DEVICES=0 python gi_attack.py \
---output_dir=./saved_models \
+CUDA_VISIBLE_DEVICES=4 python gi_attack.py \
+--output_dir=./adv_saved_models \
 --model_type=roberta \
 --tokenizer_name=microsoft/codebert-base-mlm \
 --model_name_or_path=microsoft/codebert-base-mlm \
---csv_store_path ./attack_no_gi.csv \
+--csv_store_path ./attack_no_gitest_subs_400_800_.csv \
 --base_model=microsoft/codebert-base-mlm \
 --train_data_file=../preprocess/dataset/train_subs.jsonl \
---eval_data_file=../preprocess/dataset/valid_subs.jsonl \
+--eval_data_file=../preprocess/dataset/test_subs_400_800.jsonl \
 --test_data_file=../preprocess/dataset/test_subs.jsonl \
 --block_size 512 \
 --eval_batch_size 64 \
---seed 123456 2>&1 | tee attack_no_gi.log
+--seed 123456 2>&1 | tee attack_no_gitest_subs_400_800_.log
 ```

 # Genetic Programming

@@ -255,7 +255,7 @@ CUDA_VISIBLE_DEVICES=4 python gi_attack.py \
 --base_model=microsoft/codebert-base-mlm \
 --use_ga \
 --train_data_file=../preprocess/dataset/train_subs.jsonl \
---eval_data_file=../preprocess/dataset/valid_subs.jsonl \
+--eval_data_file=../preprocess/dataset/test_subs_0_400.jsonl \
 --test_data_file=../preprocess/dataset/test_subs.jsonl \
 --block_size 512 \
 --eval_batch_size 64 \

@@ -273,15 +273,15 @@ CUDA_VISIBLE_DEVICES=4 python gi_attack.py \
 # MHM-Attack
 ```shell
 cd code
-CUDA_VISIBLE_DEVICES=6 python mhm_attack.py \
+CUDA_VISIBLE_DEVICES=2 python mhm_attack.py \
 --output_dir=./saved_models \
 --model_type=roberta \
 --tokenizer_name=microsoft/codebert-base \
 --model_name_or_path=microsoft/codebert-base \
 --csv_store_path ./attack_mhm_ls.csv \
 --base_model=microsoft/codebert-base-mlm \
 --train_data_file=../preprocess/dataset/train_subs.jsonl \
---eval_data_file=../preprocess/dataset/valid_subs.jsonl \
+--eval_data_file=../preprocess/dataset/test_subs_0_400.jsonl \
 --test_data_file=../preprocess/dataset/test_subs.jsonl \
 --block_size 512 \
 --eval_batch_size 64 \

CodeXGLUE/Defect-detection/code/run.py (+1, -1)

@@ -92,7 +92,7 @@ def convert_examples_to_features(js,tokenizer,args):
     source_ids = tokenizer.convert_tokens_to_ids(source_tokens)
     padding_length = args.block_size - len(source_ids)
     source_ids+=[tokenizer.pad_token_id]*padding_length
-    return InputFeatures(source_tokens,source_ids,js['idx'],js['target'])
+    return InputFeatures(source_tokens,source_ids,js['idx'],int(js['target']))

 class TextDataset(Dataset):
     def __init__(self, tokenizer, args, file_path=None):
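The added `int(...)` cast guards against label type drift: regenerated jsonl files (such as the adversarial sets built below) may carry `target` as a string, while `InputFeatures` is expected to hold a plain int. A minimal sketch with hypothetical rows:

```python
import json

for line in ['{"idx": 0, "target": 1}', '{"idx": 1, "target": "1"}']:
    js = json.loads(line)
    label = int(js["target"])  # 1 in both cases; without the cast, "1" != 1
    print(label == 1)          # True, True
```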
[file header not rendered in this view] (new file, +22)

@@ -0,0 +1,22 @@
+import csv
+import json
+import random
+from tqdm import tqdm
+csv.field_size_limit(100000000)
+adv_data = []
+for index in [(0,400), (400,800), (800,1200), (1200,1600), (1600,2000), (2000,2400), (2400,2800)]:
+    with open("../code/attack_genetic_test_subs_"+str(index[0])+"_"+str(index[1])+".csv") as rf:
+        reader = csv.DictReader(rf)
+        for row in reader:
+            if not len(row["Adversarial Code"]) == 0:
+                adv_data.append({"target":int(row["True Label"]), "func":row["Adversarial Code"], "idx":None})
+print(len(adv_data))
+# with open("./dataset/train.jsonl") as rf:
+#     for line in rf:
+#         adv_data.append(json.loads(line.strip()))
+# print(len(adv_data))
+random.shuffle(adv_data)
+
+with open("./dataset/adv_test.jsonl", "w") as wf:
+    for item in tqdm(adv_data):
+        wf.write(json.dumps(item)+'\n')
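A quick sanity check (assumed file location and shapes) that the regenerated file round-trips into the fields `run.py`'s `convert_examples_to_features` reads, namely `func`, `target`, and `idx`:

```python
import json

with open("./dataset/adv_test.jsonl") as rf:
    for line in rf:
        item = json.loads(line.strip())
        assert set(item) == {"target", "func", "idx"}
        assert isinstance(item["target"], int)  # written as int(row["True Label"])
```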
