# wordCloud_parallel.py
from wordCloud_FN import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import concurrent.futures
def processFunction(rows, nrows):
    '''
    processFunction(rows, nrows):
    is a function that executes the data preparation steps for the word cloud.
    parameters:
        rows (int)  -> csv rows 1 to rows-1 are skipped, so reading starts at row "rows"
        nrows (int) -> number of rows to load into the dataframe
    return:
        text (string) -> the prepared text to pass to the word cloud function.
    The data preparation is applied in the following steps:
        Step 1:  remove all 'https://t.co/' + 10 characters
        Step 2:  remove all emoji
        Step 3:  lower-case the text
        Step 4:  remove the first set of tailored stopwords
        Step 5:  remove the punctuation
        Step 6:  tokenization
        Step 7:  remove English stopwords
        Step 8:  lemmatization
        Step 9:  remove the second set of tailored stopwords
        Step 10: join the words of each row back together, separated by " "
        Step 11: join all rows into a single string
    (These helper functions are written in wordCloud_FN.py;
     please read the docstrings via "printDoc.py" for details.)
    '''
    #### Tailored stopwords ####################################################
    #
    # Make a list of tailored stopwords and remove these words
    # to retrieve better information from the word cloud.
    #
    ############################################################################
    # tailor_stopwords = []
    tailor_stopwords = ['olympic', 'tokyoolympic', 'olympicgame', 'answer', 'tokyo2020', 'like']
    #'answer slovesateez','slovesateez','slovesateez', 'solvesateez',\
    #'slovesateez dreamer']
    replaced_to = ""
    ##### Reading a partial csv file ###########################################
    #
    # Instead of loading the entire csv file into a single dataframe,
    # we load a range of rows of the csv file into a smaller dataframe.
    # (Parallel programming and the iteration are discussed in main() below.)
    # This call skips csv rows 1 to rows-1 and then reads "nrows" rows,
    # keeping only the columns 'text' and 'date'.
    #
    ############################################################################
    path = ""
    # filename = "intermediate_overall.csv"
    filename = "Olympics_Tokyo_tweets.csv"
    df = pd.read_csv(os.path.join(path, filename), header=0,
                     skiprows=range(1, rows),
                     nrows=nrows,
                     usecols=['text', 'date'])
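    # Illustration with the values used in main(): rows=54001 and nrows=54000
    # give skiprows=range(1, 54001), which drops data rows 1..54000 while keeping
    # the header (row 0), so this call reads data rows 54001..108000 only.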
    #### MASK THE DATE #########################################################
    #
    # Mask certain rows based on the "date" column
    # and keep only the rows that fall within the mask.
    #
    ############################################################################
    df['date'] = pd.to_datetime(df['date'], errors='coerce').dt.normalize()
    start_date = '2021-07-22'
    end_date = '2021-08-10'
    #start_date = '2021-08-01'
    #end_date = '2021-08-10'
    mask = (df['date'] > start_date) & (df['date'] <= end_date)
    df = df.loc[mask]
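    # Note: the comparison is exclusive on start_date and inclusive on end_date,
    # so only tweets dated 2021-07-23 through 2021-08-10 are kept.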
    #### Steps for data preparation ############################################
    #
    # Please refer to the documentation (docstring) above.
    #
    ############################################################################
    df = df_remove_http(df, 'text', 'no_http')                  ## step 1: remove all 'https://t.co/' + 10 characters
    df['no_http'] = df['no_http'].apply(remove_emoji)           ## step 2: remove emoji
    df = df_lowerCase(df, 'no_http', 'lower_text')              ## step 3: lower-case the text
    df['lower_text'] = df['lower_text'].\
        replace(dict(zip(tailor_stopwords,
                         [replaced_to] * len(tailor_stopwords))),
                regex=True)                                     ## step 4: remove first set of tailored stopwords
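    # Note: Series.replace with a dict and regex=True treats each key as a
    # regular-expression pattern and removes every occurrence of it inside the
    # text, rather than replacing only exact, full-cell matches.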
    df = df_remove_punctuation(df, 'lower_text', 'no_pun_text') ## step 5: remove the punctuation
    df = df_tokenization(df, 'no_pun_text', 'tok_words')        ## step 6: tokenization
    df = df_remove_stopwords(df, 'tok_words', 'no_stopwords')   ## step 7: remove English stopwords
    df = df_lemmatizer(df, 'no_stopwords', 'lem_words')         ## step 8: lemmatization
    df['lem_words'] = df['lem_words'].\
        apply(remove_another_stopwords)                         ## step 9: remove second set of tailored stopwords
    df['lem_words'] = df['lem_words'].\
        apply(lambda x: ' '.join([word for word in x]))         ## step 10: join the words in each row
    text = " ".join(review for review in df.lem_words)          ## step 11: join all rows into a single text
    return text
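# Usage note (illustrative only, not part of the original pipeline): calling the
# function directly, e.g. processFunction(1, 1000), prepares just the first
# 1,000 tweets of the csv file and returns them as one string, which is handy
# for testing the cleaning steps without the parallel run.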
def main():
    print(processFunction.__doc__)
    # filename = "intermediate_overall.csv"
    filename = "Olympics_Tokyo_tweets.csv"
    stop_words = set(stopwords.words("english"))
    t1 = time.perf_counter()
    print("start....")
    #### Divide the dataset for parallel programming ###########################
    #
    # The "Olympics_Tokyo_tweets.csv" dataset has around 320,000 rows.
    # processFunction will be called "range_words" times, each time loading
    # "skip" rows, and the calls are run with parallel programming.
    # This program was developed on an Intel i7-8700 CPU (6 cores).
    # Sizing for the entire dataset:
    #   54,000 (skip) x 6 (range_words) = 324,000 >= ~320,000 rows,
    #   so the whole file is covered.
    # skip and range_words can be fine-tuned further when the "date" mask is used.
    #
    ############################################################################
    skip = 54000
    range_words = 6
    rows = list()
    nrows = list()
    wholeText = ""
    totalWords = 0
    for n in range(range_words):
        rows.append((skip * n) + 1)
        nrows.append(skip)
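    # With skip = 54000 and range_words = 6 this produces
    #   rows  = [1, 54001, 108001, 162001, 216001, 270001]
    #   nrows = [54000, 54000, 54000, 54000, 54000, 54000]
    # i.e. six non-overlapping 54,000-row chunks covering the whole file.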
print (f"Preform parallel running on {range_words} times.")
print (f"Read the datafile \"{filename}\" of {nrows[0]} rows and start each run in rows as follows: \n{rows}")
    ##### Parallel run #########################################################
    #
    # executor.map(processFunction, rows, nrows)
    # parameters of processFunction:
    #   rows (int)  -> csv rows 1 to rows-1 are skipped, so reading starts at row "rows"
    #   nrows (int) -> number of rows to load into the dataframe
    # return:
    #   results (iterator) -> yields the prepared string of each chunk
    #
    ############################################################################
    with concurrent.futures.ProcessPoolExecutor() as executor:
    #with concurrent.futures.ProcessPoolExecutor(max_workers = 6) as executor:
    #with concurrent.futures.ThreadPoolExecutor() as executor:
        results = executor.map(processFunction, rows, nrows)
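        # executor.map pairs the two argument lists element-wise, so chunk i runs
        # processFunction(rows[i], nrows[i]) in its own worker process, and the
        # returned iterator yields the prepared strings in the original chunk order.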
    ##### Creating the word cloud ##############################################
    #
    # Join all the text from the results together,
    # apply the stop_words here,
    # and create the word cloud.
    #
    ############################################################################
    for result in results:
        wholeText = wholeText + " " + result
        totalWords += len(result.split())    ## count the words of each chunk, not the characters
    print("There are {} words in the combination of all reviews.".format(totalWords))
    final_wordcloud = WordCloud(width = 968, height = 968,
                                background_color = 'black',
                                stopwords = stop_words,
                                min_font_size = 10).generate(wholeText)
    # Displaying the word cloud
    plt.figure(figsize = (10, 10), facecolor = None)
    plt.imshow(final_wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
    with open('results_overall.txt', 'w', encoding='utf-8') as f:
        f.write(wholeText)
    t2 = time.perf_counter()
    print(f'Finished in {t2-t1} seconds')
    plt.show()
if __name__ == '__main__':
    main()