-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathlist_concepts_hate_speech.py
61 lines (51 loc) · 2.9 KB
/
list_concepts_hate_speech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import json
from collections import defaultdict
import os
import pandas as pd
import re
# Next-words that disqualify a concept even when the following token is tagged
# as a verb or modal. Kept as a frozenset so each membership test is O(1).
_EXCLUDED_NEXT_WORDS = frozenset([
    "'m", "is", "'re", "am", "are", "be", "'ll", "will", "shall", "were", "was", "been", "'s", "im",
    "ur", "u", "lol", "sorry", "notsexist", "fuck", "shit", "pls", "please", "t", "s", "'ve", "have",
    "has",
])

_OUTPUT_COLUMNS = ["concept", "pos", "tf_collection"]


def _load_json(path):
    """Parse the JSON file at *path*, closing the file handle afterwards."""
    with open(path, "r", encoding="utf-8") as handle:
        return json.load(handle)


def _count_verb_followed_concepts(records, train_set_text_ids):
    """Count (lower-cased text, truncated POS tags) pairs among *records*.

    A record is counted only when its ``text_id`` is in *train_set_text_ids*,
    its ``next_tag`` starts with ``V`` or equals ``MD``, and its lower-cased
    ``next_word`` is not in ``_EXCLUDED_NEXT_WORDS``.
    """
    counts = defaultdict(int)
    for record in records:
        if record["text_id"] not in train_set_text_ids:
            continue
        # A missing/empty tag (e.g. nothing follows the concept) simply does
        # not match, instead of raising IndexError on ``next_tag[0]``.
        next_tag = record["next_tag"] or ""
        if not (next_tag.startswith("V") or next_tag == "MD"):
            continue
        if (record["next_word"] or "").lower() in _EXCLUDED_NEXT_WORDS:
            continue
        # Only the first two characters of each tag are kept (e.g. NNS -> NN).
        pos_key = tuple(tag[:2] for tag in record["postags"].split(" "))
        counts[(record["text"].lower(), pos_key)] += 1
    return dict(counts)


def _write_concept_counts(counts, output_filename):
    """Write *counts* as a CSV sorted by descending frequency.

    An empty *counts* yields a header-only CSV rather than an error.
    """
    rows = [(text, " ".join(tags), freq) for (text, tags), freq in counts.items()]
    df = pd.DataFrame(rows, columns=_OUTPUT_COLUMNS)
    df = df.sort_values("tf_collection", ascending=False)
    df.to_csv(output_filename, index=False)


def list_reference_concepts(infolder, out_topfolder, path_to_data, path_to_dev_ids, path_to_test_ids):
    """Write per-class term-frequency tables of concepts followed by a verb.

    For every file in *infolder*, the class name is taken from the last
    ``_``-separated part of the file name (after removing the
    ``_concepts_extracted.json`` suffix). Concepts are counted over the
    training portion of that class only, i.e. rows of the data CSV whose index
    label appears in neither the dev nor the test id list.

    Args:
        infolder: Directory of JSON files; each maps a concept type to a list
            of records with the keys ``text_id``, ``text``, ``postags``,
            ``next_tag`` and ``next_word``.
        out_topfolder: Directory under which one
            ``concepts_by_pos_<collection>`` folder per input file is created.
        path_to_data: CSV file with at least a ``Class`` column.
        path_to_dev_ids: JSON file with the index labels of the dev rows.
        path_to_test_ids: JSON file with the index labels of the test rows.

    Returns:
        None. One CSV per concept type is written with the columns
        ``concept``, ``pos`` and ``tf_collection``.
    """
    main_df = pd.read_csv(path_to_data)
    # The held-out ids do not depend on the input file, so build the set once.
    held_out_ids = set(_load_json(path_to_dev_ids)) | set(_load_json(path_to_test_ids))

    os.makedirs(out_topfolder, exist_ok=True)

    for inputfile in os.listdir(infolder):
        # Escaped pattern: the "." must match a literal dot only.
        collection_name = re.sub(r"_concepts_extracted\.json", "", inputfile)
        curclass = collection_name.split("_")[-1]

        curdf = main_df[main_df["Class"] == curclass].copy()
        # text_id is the position of a row inside its class subset.
        curdf["text_id"] = range(curdf.shape[0])
        train_index = sorted(set(curdf.index.tolist()) - held_out_ids)
        train_set_text_ids = set(curdf.loc[train_index, "text_id"].tolist())

        extracted = _load_json(os.path.join(infolder, inputfile))
        counts_by_type = {
            concept_type: _count_verb_followed_concepts(records, train_set_text_ids)
            for concept_type, records in extracted.items()
        }

        outfolder = os.path.join(out_topfolder, "concepts_by_pos_%s" % (collection_name))
        os.makedirs(outfolder, exist_ok=True)

        for concept_type, counts in counts_by_type.items():
            type_label = concept_type.lower()
            if type_label == "concepts":
                type_label = "noun_phrase_and_numeric"
            output_filename = os.path.join(
                outfolder, "%s_concepts_tf_%s.csv" % (type_label, collection_name))
            _write_concept_counts(counts, output_filename)
if __name__ == "__main__":
    # Script entry point: run the extraction with the project's default
    # input/output locations (all paths are relative to the working directory).
    default_paths = {
        "infolder": "processed_data/concepts_extracted_with_next_tag_next_word",
        "out_topfolder": "processed_data/reference_concepts_followed_by_verb_not_aux",
        "path_to_data": "data/all_data_waseem.csv",
        "path_to_dev_ids": "data/post_ids_dev.json",
        "path_to_test_ids": "data/post_ids_test.json",
    }
    list_reference_concepts(**default_paths)