-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathpreprocess.py
129 lines (111 loc) · 4.19 KB
/
preprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""
Preprocesses tweets
Usage:
$ python3 preprocess.py <raw.csv> <train/test>
(the last argument is a flag selecting which kind of data to process)
"""
import re
import sys
from utils import write_status
from nltk.stem.porter import PorterStemmer
def preprocess_word(word):
    """Clean a single token.

    Leading/trailing punctuation is trimmed, any run of a repeated
    character is squeezed down to exactly two occurrences
    (``funnnnny`` becomes ``funny``), and finally every hyphen and
    apostrophe is dropped.
    """
    # Trim quote/punctuation characters from both ends only.
    trimmed = word.strip('\'"?!,.():;')
    # Squeeze runs of the same character to two before removing - and ',
    # so characters separated by a hyphen are not merged into one run.
    squeezed = re.sub(r'(.)\1+', r'\1\1', trimmed)
    return re.sub(r'(-|\')', '', squeezed)
def is_valid_word(word):
    """Return True when ``word`` starts with an ASCII letter and is
    otherwise made up only of ASCII letters, digits, dots and underscores.
    """
    found = re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word)
    return found is not None
def handle_emojis(tweet):
    """Replace text emoticons in ``tweet`` with placeholder tokens.

    Positive emoticons become ' EMO_POS ' and negative ones ' EMO_NEG '
    (each padded with a space on both sides). The rules are applied one
    after another in the order listed below.
    """
    emoticon_rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;, (-;
        (r'(;-?\)|;-?D|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, token in emoticon_rules:
        tweet = re.sub(pattern, token, tweet)
    return tweet
def preprocess_tweet(tweet, *, stemmer=None):
    """Normalize a raw tweet into a space-separated string of clean words.

    The tweet is lower-cased; URLs, @handles and emoticons are replaced
    by the placeholders URL, USER_MENTION and EMO_POS / EMO_NEG; the '#'
    of hashtags and the retweet marker 'rt' are removed. Each remaining
    token is cleaned with ``preprocess_word`` and kept only if
    ``is_valid_word`` accepts it.

    Args:
        tweet: Raw tweet text.
        stemmer: Optional object with a ``stem(word)`` method applied to
            every kept word. When omitted, the function falls back to the
            module-level ``use_stemmer`` / ``porter_stemmer`` pair set by
            the script entry point, if present. This lets the function be
            called after a plain ``import`` without raising NameError.

    Returns:
        The kept words joined by single spaces (possibly an empty string).
    """
    if stemmer is None and globals().get('use_stemmer', False):
        # Script mode with stemming switched on: use the stemmer that the
        # entry point created alongside the flag.
        stemmer = porter_stemmer

    # Convert to lower case
    # NOTE(review): this runs before handle_emojis, whose laugh/wink
    # patterns only list an upper-case 'D'; confirm whether emoticons
    # such as ':D' are meant to be recognised on this path.
    tweet = tweet.lower()
    # Replaces URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER_MENTION
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # Replaces #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
    # Replace emojis with either EMO_POS or EMO_NEG
    tweet = handle_emojis(tweet)
    # Replace multiple spaces with a single space
    tweet = re.sub(r'\s+', ' ', tweet)

    processed_words = []
    for word in tweet.split():
        word = preprocess_word(word)
        if not is_valid_word(word):
            continue
        if stemmer is not None:
            word = str(stemmer.stem(word))
        processed_words.append(word)
    return ' '.join(processed_words)
def preprocess_csv(csv_file_name, processed_file_name, test_file=False):
    """Preprocess every tweet in a CSV file and write the result to a new file.

    Input rows are split on the first commas only, so the tweet text may
    itself contain commas:
      * training data (``test_file=False``): ``id,label,tweet``; the
        label is parsed with ``int`` and written back with ``%d``;
      * test data (``test_file=True``): ``id,tweet``.

    Args:
        csv_file_name: Path of the UTF-8 encoded input CSV.
        processed_file_name: Path of the output file (overwritten).
        test_file: True if the rows carry no label column.

    Returns:
        ``processed_file_name``.

    Raises:
        ValueError: If a training row's label is not an integer.
    """
    # Read the input completely before the output is opened, so a
    # missing or unreadable input never truncates an existing output file.
    with open(csv_file_name, 'r', encoding='utf-8') as csv_file:
        lines = csv_file.readlines()
    total = len(lines)

    # The context manager closes the output even if a row fails to parse.
    with open(processed_file_name, 'w', encoding='utf-8') as save_to_file:
        for i, line in enumerate(lines):
            tweet_id, _, rest = line.partition(',')
            if test_file:
                processed_tweet = preprocess_tweet(rest)
                save_to_file.write('%s,%s\n' %
                                   (tweet_id, processed_tweet))
            else:
                label, _, tweet = rest.partition(',')
                positive = int(label)
                processed_tweet = preprocess_tweet(tweet)
                save_to_file.write('%s,%d,%s\n' %
                                   (tweet_id, positive, processed_tweet))
            # Called once per row with (rows done, total rows);
            # presumably a progress display -- see utils.write_status.
            write_status(i + 1, total)
    print('\nSaved processed tweets to: %s' % processed_file_name)
    return processed_file_name
if __name__ == '__main__':
    # Command-line entry point: preprocess.py <raw-CSV> test/train
    import os

    if len(sys.argv) != 3:
        print('Usage: python preprocess.py <raw-CSV> test/train')
        # sys.exit rather than the interactive-only exit() helper, with a
        # non-zero status so callers can detect the usage error.
        sys.exit(1)

    mode = sys.argv[2].lower()
    if mode not in ('test', 'train'):
        print('\nTest/Train file flag not properly defined, processing '
              'training file')
    # Anything other than "test" is handled as training data.
    flag = mode == 'test'
    if flag:
        print('\nProcessing test dataset')
    else:
        print('\nProcessing train dataset')

    # Kept at module level on purpose: preprocess_tweet reads
    # use_stemmer / porter_stemmer as globals.
    use_stemmer = False
    csv_file_name = sys.argv[1]
    # Drop the real extension instead of assuming it is 4 characters long.
    base_name = os.path.splitext(csv_file_name)[0]
    processed_file_name = base_name + '-processed.csv'
    if use_stemmer:
        porter_stemmer = PorterStemmer()
        processed_file_name = base_name + '_processed-stemmed.csv'
    preprocess_csv(csv_file_name, processed_file_name, test_file=flag)