-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathCreateVocabulary.py
50 lines (42 loc) · 1.69 KB
/
CreateVocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!\Python36\python.exe
import json
import re
import os
from nltk.stem import PorterStemmer
import PreProcessing as pp
# Shared Porter stemmer instance; GenerateVocabularyData calls ps.stem() on
# unigrams when stemming is enabled.
ps = PorterStemmer()
def GenerateVocabularyData(stemming, vocabulary_path, train_data_path):
    """Build the unigram/bigram vocabulary from training data and save it as JSON.

    The training file is loaded with ``json.load`` and iterated as a mapping
    from a class label to an iterable of text samples. Each sample is passed
    through ``pp.StopWordAndSpecialCharRemoval`` and split on whitespace; the
    resulting tokens (stemmed when ``stemming`` is truthy) and their adjacent
    pairs are collected into the vocabulary.

    Args:
        stemming: When truthy, tokens are stemmed before being added to the
            vocabulary. The flag is also forwarded unchanged to
            ``pp.StopWordAndSpecialCharRemoval``.
        vocabulary_path: Path of the JSON file to write. An existing file is
            overwritten.
        train_data_path: Path of the JSON training file to read.

    Returns:
        A dict with the keys ``'PositiveWordCount'`` and
        ``'NegativeWordCount'``. Each value is the sum, over the samples of
        that class, of the number of distinct unigrams plus the number of
        distinct bigrams in the sample. Every label other than ``'positive'``
        is counted as negative.
    """
    with open(train_data_path) as tdf:
        train_data = json.load(tdf)

    unigram_set = set()
    bigram_set = set()
    TotalPositiveWordCount = 0
    TotalNegativeWordCount = 0

    for key, samples in train_data.items():
        for value in samples:
            pp_value = pp.StopWordAndSpecialCharRemoval(value, stemming)
            # Tokenize once and reuse the result for unigrams and bigrams.
            # NOTE(review): leading/trailing whitespace in pp_value yields
            # empty-string tokens here, as it did before -- confirm whether
            # the preprocessing step already strips the text.
            tokens = re.split(r'\s+', pp_value)

            if stemming:
                bigrams = zip(pp.perform_stemming(tokens[:-1]),
                              pp.perform_stemming(tokens[1:]))
                # Stem whole tokens. Iterating pp_value itself would walk the
                # string character by character and fill the vocabulary with
                # single letters instead of words.
                unigrams = [ps.stem(word) for word in tokens]
            else:
                bigrams = zip(tokens[:-1], tokens[1:])
                unigrams = tokens

            sample_unigrams = set(unigrams)
            sample_bigrams = set(bigrams)
            unigram_set |= sample_unigrams
            bigram_set |= sample_bigrams

            # Per-sample count of distinct unigrams plus distinct bigrams.
            sample_word_count = len(sample_unigrams) + len(sample_bigrams)
            if key == 'positive':
                TotalPositiveWordCount += sample_word_count
            else:
                TotalNegativeWordCount += sample_word_count

    # Sort so the vocabulary file is reproducible between runs; set iteration
    # order for strings varies with hash randomization.
    vocabulary_dict = {
        'unigram': sorted(unigram_set),
        'bigram': sorted(bigram_set),
    }

    # Mode 'w' truncates an existing file, so no explicit removal is needed.
    with open(vocabulary_path, 'w') as outfile:
        json.dump(vocabulary_dict, outfile, sort_keys=True, indent=4)

    return {'PositiveWordCount': TotalPositiveWordCount, 'NegativeWordCount' : TotalNegativeWordCount}