analyzer.py
import nltk  # type: ignore
import json
import re
import string
from typing import Dict, Iterator, List, Tuple
from functools import partial, reduce
import matplotlib.pyplot as plt  # type: ignore
from nltk.sentiment.vader import SentimentIntensityAnalyzer  # type: ignore

wnl = nltk.WordNetLemmatizer()
journal_file = "out/Journal.json"
analyzed_path = "out/Analyzed_Journal.json"
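
# A minimal setup note: word_tokenize/sent_tokenize, the VADER analyzer, and the
# WordNet lemmatizer each rely on an NLTK data package, so a first run typically needs:
#   nltk.download("punkt"); nltk.download("vader_lexicon"); nltk.download("wordnet")
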
def tokens_then_text(text: str) -> nltk.Text:
    # Tokenize a string and wrap the tokens in an nltk.Text for exploration.
    # (Renamed the parameter from `input`, which shadows the builtin.)
    tokens = nltk.word_tokenize(text)
    return nltk.Text(tokens)

def collect_by(key, acc, item):
    # Reducer: append `item` to the bucket acc[item[key]], creating it if needed.
    if item[key] not in acc:
        acc[item[key]] = [item]
    else:
        acc[item[key]].append(item)
    return acc

def collect_unique(entries: List[Dict], key) -> Dict:
    # Group entries into a dict keyed by each entry's value for `key`.
    collect_by_key = partial(collect_by, key)
    return reduce(collect_by_key, entries, {})

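# A small sketch of what collect_unique produces, using made-up entries:
#   collect_unique([{"author": "ada", "content": "hi"},
#                   {"author": "ada", "content": "bye"}], "author")
#   == {"ada": [{"author": "ada", "content": "hi"},
#               {"author": "ada", "content": "bye"}]}
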
def analyze_by_author(entries: List[Dict]) -> Iterator[Tuple[str, str]]:
    # Pair each author with all of their entries' contents joined into one string.
    get_content = partial(map, lambda entry: entry["content"])

    def pipeline_per_author(author_entries):
        return "\n".join(get_content(author_entries))  # type: ignore

    uniques = collect_unique(entries, "author")
    return zip(uniques.keys(), map(pipeline_per_author, uniques.values()))

def read_in_json(filename: str) -> List[Dict]:
    try:
        with open(filename, "r") as journal:
            return json.load(journal)
    except IOError:
        print("Could not read the file", filename)
        return []  # fall back to an empty journal so later steps do not crash on None

def enhance_entry(entry: Dict, content: str) -> Dict:
    # Add nltk stats, scores and breakdowns to the entry
    tokens = nltk.word_tokenize(content)
    lemmatized = [wnl.lemmatize(token) for token in tokens]
    sentences = nltk.sent_tokenize(content)
    sid = SentimentIntensityAnalyzer()
    sentiment_score = sid.polarity_scores(content)
    entry["sentences"] = sentences
    entry["tokens"] = tokens
    entry["word_density"] = len(tokens) / len(sentences)  # average tokens per sentence
    entry["lemmatized"] = lemmatized
    entry["sentiment_score"] = sentiment_score
    return entry

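# For orientation: polarity_scores returns a dict with "neg", "neu", "pos", and
# "compound" keys, e.g. {"neg": 0.0, "neu": 0.7, "pos": 0.3, "compound": 0.44}
# (the numbers here are illustrative, not from the journal data).
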
def filter_punctuation(tokens):
    # Keep only tokens containing at least one non-punctuation character, i.e.
    # drop tokens made entirely of punctuation. re.escape keeps the regex
    # metacharacters in string.punctuation from being misread inside the class.
    return [token for token in tokens
            if re.search(f"[^{re.escape(string.punctuation)}’]", token)]

def author_conditional_frequency(entries_by_author):
    # Conditional frequency of pronoun use, conditioned on author.
    pronouns = ["i", "me", "you", "us", "he",
                "she", "him", "her", "they", "them"]
    return nltk.ConditionalFreqDist(
        (author, word)
        for author, entry in entries_by_author
        for word in nltk.word_tokenize(entry.lower())
        if word in pronouns
    )

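# The returned ConditionalFreqDist can be explored with its own API, e.g.
# cfd.tabulate() for a text table or cfd.plot() for a matplotlib chart.
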
def to_percent(count, tokens):
    # Express a count as a percentage of the vocabulary size (unique tokens).
    return count / len(set(tokens)) * 100

def get_frequencies(tokens, start=0, sample_size=5):
    # Return a slice of the most common tokens with vocabulary-relative percentages.
    # Plain slicing already clamps to the list length, so the previous
    # `sample_size % len(frequencies)` arithmetic (which shrank or zeroed the
    # window whenever sample_size >= the number of distinct tokens) is unneeded.
    frequencies = nltk.FreqDist(tokens)
    sliced = frequencies.most_common()[start: start + sample_size]
    return [(word, to_percent(frequency, tokens)) for word, frequency in sliced]

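# A worked example on toy tokens: nltk.FreqDist(["a", "a", "b"]) counts a=2, b=1,
# and the vocabulary has 2 unique tokens, so
#   get_frequencies(["a", "a", "b"], 0, 2) == [("a", 100.0), ("b", 50.0)]
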
def plot_frequencies(frequent_words, word_frequencies):
    plt.plot(frequent_words, word_frequencies)

entries = read_in_json(journal_file)
entries_by_author = list(analyze_by_author(entries))
# map() is lazy in Python 3; iterate eagerly so the entries are actually enhanced.
for entry in entries:
    enhance_entry(entry, entry["content"])
cfd = author_conditional_frequency(entries_by_author)
cfd.tabulate()  # print the pronoun-by-author frequency table
for author, aggregate in entries_by_author:
    tokens = filter_punctuation(nltk.word_tokenize(aggregate))
    frequencies = get_frequencies(tokens, 0, 100)
    # zip(*list_of_tuples) actually unzips: it splits the (word, percent)
    # pairs back into two parallel sequences for plotting.
    plot_list = map(list, zip(*frequencies))
    plot_frequencies(*plot_list)
    # get_frequencies already returns percentages, so take the top ten as-is
    # rather than normalizing a second time.
    normalized_frequencies = frequencies[:10]
    print(normalized_frequencies)
plt.show()
with open(analyzed_path, "w") as output:
    json.dump(entries, output)