-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexport_bodies.py
37 lines (24 loc) · 1.36 KB
/
export_bodies.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import pandas as pd
"""Export all of the email bodies to a CSV file for data labelling.

Uses the full text rather than the preprocessed text, but the Enron bodies
still need to be extracted."""
# Educational institute data
# Load the subject, content and closing-remarks columns, then collapse the
# content and closing remarks of each email into a single 'Body' column.
dfEdu = pd.read_csv(
    './educational-institute-dataset/PhishingEmailData.csv',
    encoding="ISO-8859-1",
    usecols=['Email_Subject', 'Email_Content', 'Closing_Remarks'],
)
# read_csv ignores the order of `usecols` and keeps the file's own column
# order, so the body columns are selected by name rather than by position
# (the previous positional slice `columns[1:]` only skipped the subject if
# it happened to be the first column in the file).
dfEdu['Body'] = dfEdu[['Email_Content', 'Closing_Remarks']].apply(
    # Join the non-missing parts of the row with a newline between them.
    lambda x: '\n'.join(x.dropna().astype(str)),
    axis=1
)
# Only 'Body' is kept so this frame lines up with the other datasets.
dfEdu = dfEdu.drop(columns=['Email_Subject', 'Email_Content', 'Closing_Remarks'])
# Email Spam Dataset
# Three CSV files from the same dataset folder; only the 'Body' column of
# each one is loaded.
dfSA = pd.read_csv(
    './kaggle-datasets/Email-Spam-Dataset/completeSpamAssassin.csv',
    usecols=['Body'],
)
dfEnron = pd.read_csv(
    './kaggle-datasets/Email-Spam-Dataset/enronSpamSubset.csv',
    usecols=['Body'],
)
# Keep every row of the Enron subset except the first one.
dfEnron = dfEnron.iloc[1:]
dfLing = pd.read_csv(
    './kaggle-datasets/Email-Spam-Dataset/lingSpam.csv',
    usecols=['Body'],
)
# Enron Full dataset
# Prerequisite: process_enron() in classif.ipynb must have been run, since
# it produces the file read below.
# Duplicates are not removed in this file either.
dfEnronFull = pd.read_csv(
    './Processed-Datasets/Enron-Bodies/emails.csv',
    usecols=['message'],
)
# Use the same column name as the other frames.
dfEnronFull = dfEnronFull.rename(columns={"message": "Body"})
# Merge all
# Stack every source frame on top of each other, then discard rows that
# contain a missing value.
frames = [
    dfEdu,
    dfSA,
    dfEnron,
    dfLing,
    dfEnronFull,
]
dfAll = pd.concat(frames)
dfAll = dfAll.dropna()
# Show a preview and wait for the user before anything is written to disk.
print(dfAll.head())
input("Correct? any key to continue")
# The row index is not written to the output file.
dfAll.to_csv("./Processed-Datasets/all-bodies.csv", index=False)