
Commit e923f0e

Add files via upload
1 parent b7c06b4 commit e923f0e

22 files changed: +728 -0 lines changed

DataProcessor.py

+201
@@ -0,0 +1,201 @@
import os
import string

import nltk
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag, ngrams
from nltk.corpus import cmudict
from textstat import textstat

from collections import Counter

from transformers import pipeline
from transformers import BertTokenizer
from SemanticRepetitionDetector import SemanticRepetitionDetector


# Ensure required NLTK downloads
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('cmudict')
nltk.download('averaged_perceptron_tagger')  # needed by pos_tag


class DataProcessor:
    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
        try:
            # Map each word to its minimum syllable count (vowel phonemes carry a stress digit).
            self.cmu_dict = {word: min(len([phone for phone in pron if phone[-1].isdigit()]) for pron in prons)
                             for word, prons in cmudict.dict().items()}
            print("CMU Dictionary successfully loaded and processed.")
        except Exception as e:
            print(f"Failed to load or process CMU Dictionary: {e}")
            self.cmu_dict = {}

        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

        self.feature_names = [
            'Document Length',
            'Mean Sentence Length',
            'Mean Word Length',
            'Readability',
            'Lexical Richness',
            'Semantic Repetition',
            'Function Words Frequency',
            'Content Words Frequency',
            'Punctuation Usage',
            'Sentiment Indicator',
            'Sentiment Strength'
        ]

    def _get_feature_names(self):
        return self.feature_names

    # functions for features
    ### Features for Phraseology ###
    def _document_length(self, text):
        return len(word_tokenize(text))

    def _mean_sentence_length(self, text):
        sentences = sent_tokenize(text)
        sentence_lengths = [len(word_tokenize(sentence)) for sentence in sentences]
        return np.mean(sentence_lengths) if sentence_lengths else 0

    def _mean_word_length(self, text):
        tokens = [word for word in word_tokenize(text) if word.isalpha()]  # Ignore punctuation
        lengths = [len(word) for word in tokens]
        return np.mean(lengths) if lengths else 0

    ### Features for Lexical Usage ###
    def _readability(self, text):
        return textstat.flesch_reading_ease(text)

    def _lexical_richness(self, text):
        # Type-token ratio: distinct tokens divided by total tokens.
        tokens = word_tokenize(text)
        types = len(set(tokens))
        tokens_total = len(tokens)
        return types / tokens_total if tokens_total > 0 else 0

    def _semantic_repetition(self, text):
        semantic_repetition = SemanticRepetitionDetector()
        return semantic_repetition.count_repetitions(text)

    def _function_words_frequency(self, text):
        tokens = word_tokenize(text.lower())
        function_words = [word for word in tokens if word in self.stop_words]
        return len(function_words) / len(tokens) if tokens else 0

    def _content_words_frequency(self, text):
        # Tokenize and apply POS tagging
        tokens = word_tokenize(text)
        words_and_pos = pos_tag(tokens)
        # Nouns, verbs, adjectives, and adverbs count as content words.
        content_pos_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
                            'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}
        content_words = [word for word, pos in words_and_pos
                         if pos in content_pos_tags and word.lower() not in self.stop_words]
        content_word_count = len(content_words)
        total_words = len(tokens)
        return content_word_count / total_words if total_words > 0 else 0

    ### Features for Punctuation Usage ###
    def _punctuation_usage(self, text):
        punctuations = Counter(char for char in text if char in string.punctuation)
        return sum(punctuations.values())

    ### Features for Sentiment Analysis ###
    def _sentiment_indicator(self, text):
        segments = self._split_into_segments(text)
        if not segments:
            return 0  # Neutral if there is nothing to classify
        # Process in batches
        results = self.sentiment_pipeline(segments)

        sentiment_score = sum(1 if res['label'] == 'POSITIVE' else -1 if res['label'] == 'NEGATIVE' else 0
                              for res in results)

        if sentiment_score > 0:
            return 1   # Positive sentiment
        elif sentiment_score < 0:
            return -1  # Negative sentiment
        else:
            return 0   # Neutral sentiment

    def _sentiment_strength(self, text):
        segments = self._split_into_segments(text)
        if not segments:
            return 0  # Return early if no segments

        # Process all segments at once using batch processing
        results = self.sentiment_pipeline(segments)

        # Initialize sentiment counts
        sentiment_counts = {1: 0, -1: 0, 0: 0}

        # Update counts based on results from the batch processing
        for res in results:
            label = 1 if res['label'] == 'POSITIVE' else -1 if res['label'] == 'NEGATIVE' else 0
            sentiment_counts[label] += 1

        # Return the count of the most predominant sentiment
        predominant_sentiment = max(sentiment_counts, key=sentiment_counts.get)
        return sentiment_counts[predominant_sentiment]

    def _split_into_segments(self, text, max_length=510):
        # Use the BERT tokenizer to split the text into segments that fit the model's 512-token limit
        # (510 leaves room for the [CLS] and [SEP] special tokens).
        tokens = self.tokenizer.tokenize(text)
        segments = []
        current_segment = []

        for token in tokens:
            if len(current_segment) + 1 > max_length:
                segments.append(self.tokenizer.convert_tokens_to_string(current_segment))
                current_segment = []
            current_segment.append(token)

        if current_segment:
            segments.append(self.tokenizer.convert_tokens_to_string(current_segment))

        return segments

    def _extract_features(self, text):
        features = [
            # Phraseology
            self._document_length(text),
            self._mean_sentence_length(text),
            self._mean_word_length(text),
            # Lexical
            self._readability(text),
            self._lexical_richness(text),
            self._semantic_repetition(text),
            self._function_words_frequency(text),
            self._content_words_frequency(text),
            # Punctuation
            self._punctuation_usage(text),
            # Sentiment
            self._sentiment_indicator(text),
            self._sentiment_strength(text),
        ]
        return features

    def create_feature_matrix(self, directory):
        features_matrix = []
        for root, _, files in os.walk(directory):
            for filename in files:
                file_path = os.path.join(root, filename)
                with open(file_path, 'r', encoding='utf-8') as f:
                    text = f.read()
                features = self._extract_features(text)
                features_matrix.append(features)
        return np.array(features_matrix)

Maya Angelou.pdf

3.44 MB
Binary file not shown.

README .txt

+48
@@ -0,0 +1,48 @@
# Stylometry Approach for Detecting Writing Style Changes in Poetry Text

## Overview
This project analyzes stylistic changes in the poetry of Maya Angelou across different periods of her career using natural language processing techniques. The focus is on extracting and comparing stylometric features and topic modeling to understand the evolution of her writing style.

## Project Structure
- `DataProcessor.py`: Contains functions for data preprocessing and feature extraction.
- `TextModeler.py`: Includes methods for text modeling and analysis.
- `Stylometry.py`: Tools for performing stylometric analysis.
- `Visualizer.py`: Provides visualization functions for the analysis results.
- `SemanticRepetitionDetector.py`: Detects semantic repetition using a BERT model.
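The main script that ties these modules together is not included in this commit, so the sketch below is only one assumed way to wire the committed classes, following their interfaces and the folder layout described under "Running the Analysis"; the corpus paths are assumptions.

```python
# Assumed wiring of the committed classes; the real entry-point script is not in this commit.
from DataProcessor import DataProcessor
from Stylometry import Stylometry

processor = DataProcessor()

# Corpus folders as described under "Running the Analysis" (paths are assumptions).
early_features = processor.create_feature_matrix("Maya Angelou/initial state")
late_features = processor.create_feature_matrix("Maya Angelou/final state")

stylometry = Stylometry()
stylometry.fit(early_features)                   # PCA + KMeans fitted on the early-period poems
late_labels = stylometry.predict(late_features)  # cluster assignments for the later-period poems
print(late_labels)
```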
## Extracting the Project

After downloading the NLP_project.zip file, extract it to your desired location.

## Running the Analysis

1. Navigate to the `NLP_project` folder.
2. Ensure that the folders `initial state` and `final state` are present within the `Maya Angelou` folder.
3. Run the main script file using Python. Ensure you have all the required dependencies installed.
## Notes
- The script uses relative paths to access the corpus data. It expects the corpus folders to be in the `Maya Angelou` directory within the root project directory; the expected layout is sketched below.
- If you encounter any path errors, please check that the folder structure matches the expected format and that the script is executed from the root project directory.
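The layout below is inferred from the paths referenced in this README and the files in this commit; the main script and any poem file names are not shown here.

```
NLP_project/
├── DataProcessor.py
├── TextModeler.py
├── Stylometry.py
├── Visualizer.py
├── SemanticRepetitionDetector.py
└── Maya Angelou/
    ├── initial state/   # early-period poems as text files
    └── final state/     # later-period poems as text files
```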
## Installation
To run this project, you will need to install the required Python libraries. You can install them using the following command:

```bash
pip install -r requirements.txt
```
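The `requirements.txt` file itself is not part of this commit; judging only from the imports in the committed modules, it would need to cover at least the following packages (versions are not pinned here):

```
nltk
numpy
torch
transformers
textstat
scikit-learn
matplotlib
```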
## IMPORTANT NOTE

The function `_semantic_repetition(self, text)` in the `DataProcessor` class
takes roughly five minutes or more to run, since it uses a BERT model to check each poem for semantic repetition.
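To gauge that runtime on your machine before processing the whole corpus, you can try the detector on a single poem; a minimal sketch follows, where the file path is a placeholder:

```python
# Gauge the per-poem cost of the BERT-based repetition check (file path is a placeholder).
from SemanticRepetitionDetector import SemanticRepetitionDetector

detector = SemanticRepetitionDetector()
with open("Maya Angelou/initial state/poem.txt", encoding="utf-8") as f:
    poem = f.read()

# Number of sentence pairs whose BERT embeddings have cosine similarity above 0.9.
print(detector.count_repetitions(poem))
```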
Daniel Adam, I.D. 342475639
B.Sc. in Computer Science with specialization in Data Science
Topics in Natural Language Processing
Ben Gurion University of the Negev

Report of the project.pdf

1010 KB
Binary file not shown.

SemanticRepetitionDetector.py

+27
@@ -0,0 +1,27 @@
from transformers import BertTokenizer, BertModel
import torch
from nltk.tokenize import sent_tokenize
import numpy as np

class SemanticRepetitionDetector:
    def __init__(self):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')
        self.model.eval()  # inference only

    def _get_embeddings(self, text):
        # Mean-pool the last hidden state to get a single sentence embedding.
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(1)

    def count_repetitions(self, text, threshold=0.9):
        # Count sentence pairs whose embeddings exceed the cosine-similarity threshold.
        sentences = sent_tokenize(text)
        embeddings = [self._get_embeddings(sentence).squeeze(0).numpy() for sentence in sentences]
        repetition_count = 0

        # Compare each sentence to every other sentence
        for i in range(len(embeddings)):
            for j in range(i + 1, len(embeddings)):
                sim = np.dot(embeddings[i], embeddings[j]) / (np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[j]))
                if sim > threshold:
                    repetition_count += 1
        return repetition_count

Stylometry.py

+83
@@ -0,0 +1,83 @@
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import os


class Stylometry:
    def __init__(self):
        self.pca = None
        self.kmeans = None
        self.n_components = None
        self.n_clusters = None
        self.train_data = None
        self.test_data = None
        self.reduced_train_data = None
        self.reduced_test_data = None
        self.train_labels = None
        self.test_labels = None

    def _choose_n_components(self, data, variance_threshold=0.95):
        # Smallest number of principal components that explains the variance threshold.
        pca = PCA()
        pca.fit(data)
        cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
        n_components = np.where(cumulative_variance >= variance_threshold)[0][0] + 1
        return n_components

    def _choose_n_clusters(self, data, k_range=range(2, 10)):
        # Choose k by silhouette score; the elbow plot (sum of squared distances) is saved for inspection.
        scores = []
        ssd = []
        for k in k_range:
            kmeans = KMeans(n_clusters=k, random_state=42)
            labels = kmeans.fit_predict(data)
            score = silhouette_score(data, labels)
            scores.append(score)
            ssd.append(kmeans.inertia_)

        self._plot_elbow_method(ssd, k_range)
        optimal_k = k_range[np.argmax(scores)]
        return optimal_k

    def _plot_elbow_method(self, ssd, k_range):
        plt.figure(figsize=(8, 4))
        plt.plot(k_range, ssd, 'bx-')
        plt.xlabel('k (number of clusters)')
        plt.ylabel('Sum of squared distances')
        plt.title('Elbow Method For Optimal k')
        plots_directory = os.path.join("Visualizations", "Plots of clustering")
        if not os.path.exists(plots_directory):
            os.makedirs(plots_directory)
        file_path = os.path.join(plots_directory, 'Elbow Method for Optimal k.png')
        plt.savefig(file_path)
        plt.show()

    def fit(self, train_data):
        self.train_data = train_data
        self.n_components = self._choose_n_components(train_data)
        self.pca = PCA(n_components=self.n_components)
        self.reduced_train_data = self.pca.fit_transform(train_data)
        self.n_clusters = self._choose_n_clusters(self.reduced_train_data)
        self.kmeans = KMeans(n_clusters=self.n_clusters, random_state=42)
        self.train_labels = self.kmeans.fit_predict(self.reduced_train_data)

    def predict(self, test_data):
        if self.pca is None or self.kmeans is None:
            raise ValueError("Must fit on train data before predicting.")
        self.test_data = test_data
        self.reduced_test_data = self.pca.transform(test_data)
        self.test_labels = self.kmeans.predict(self.reduced_test_data)
        return self.test_labels

    def get_reduced_data(self):
        return self.reduced_train_data, self.reduced_test_data

    def get_labels(self):
        return self.train_labels, self.test_labels
