danieladam7
diff --git a/‎DataProcessor.py
+201 b/‎DataProcessor.py
+201
diff --git a/‎Maya Angelou.pdf
3.44 MB b/‎Maya Angelou.pdf
3.44 MB
diff --git a/‎README .txt
+48 b/‎README .txt
+48
diff --git a/‎Report of the project.pdf
1010 KB b/‎Report of the project.pdf
1010 KB
diff --git a/‎SemanticRepetitionDetector.py
+27 b/‎SemanticRepetitionDetector.py
+27
diff --git a/‎Stylometry.py
+83 b/‎Stylometry.py
+83
@@ -0,0 +1,201 @@
+import os
+import string
+
+import nltk
+import numpy as np
+from nltk.tokenize import word_tokenize, sent_tokenize
+from nltk.stem import PorterStemmer
+from nltk.corpus import stopwords
+from nltk import pos_tag, ngrams
+from nltk.corpus import cmudict
+from textstat import textstat
+
+from collections import Counter
+
+from transformers import pipeline
+from transformers import BertTokenizer
+from SemanticRepetitionDetector import SemanticRepetitionDetector
+
+
+
+
+
+
+
+# Ensure required NLTK downloads
+nltk.download('punkt')
+nltk.download('stopwords')
+nltk.download('cmudict')
+
+
+
+class DataProcessor:
+    def __init__(self):
+        self.stop_words = set(stopwords.words('english'))
+        try:
+            self.cmu_dict = {word: min([len([y for y in pron if y[-1].isdigit()]) for pron in prons])
+                             for word, prons in cmudict.entries()}
+            print("CMU Dictionary successfully loaded and processed.")
+        except Exception as e:
+            print(f"Failed to load or process CMU Dictionary: {e}")
+            self.cmu_dict = {}
+
+        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        self.sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")
+
+        self.feature_names = [
+                            'Document Length',
+                            'Mean Sentence Length',
+                            'Mean Word Length',
+                            'Readability',
+                            'Lexical Richness',  
+                            'Semantic Repetition',                          
+                            'Function Words Frequency',
+                            'Content Words Frequency',
+                            'Punctuation Usage', 
+                            'Sentiment Indicator',
+                            'Sentiment Strength'
+                        ]  
+
+        
+    def _get_feature_names(self):
+        return self.feature_names
+    
+    # functions for features
+    ### Features for Phraseology ###
+    def _document_length(self, text):
+        return len(word_tokenize(text))
+    
+    def _mean_sentence_length(self, text):
+        sentences = sent_tokenize(text)
+        sentence_lengths = [len(word_tokenize(sentence)) for sentence in sentences]
+        return np.mean(sentence_lengths) if sentence_lengths else 0
+
+    def _mean_word_length(self, text):
+        tokens = [word for word in word_tokenize(text) if word.isalpha()]  # Ignore punctuation
+        lengths = [len(word) for word in tokens]
+        return np.mean(lengths) if lengths else 0
+
+    ### Features for Lexical Usage ###
+    def _readability(self, text):
+        return textstat.flesch_reading_ease(text)
+      
+    def _lexical_richness(self, text):
+        tokens = word_tokenize(text)
+        types = len(set(tokens))
+        tokens_total = len(tokens)
+        return types / tokens_total if tokens_total > 0 else 0
+    
+               
+    def _semantic_repetition(self,text):
+        semantic_repeteition = SemanticRepetitionDetector()
+        return semantic_repeteition.count_repetitions(text)
+
+    
+    def _function_words_frequency(self, text):
+        tokens = word_tokenize(text.lower())
+        function_words = [word for word in tokens if word in self.stop_words]
+        return len(function_words) / len(tokens) if tokens else 0
+    
+    def _content_words_frequency(self, text):
+        # Tokenize and apply POS tagging
+        tokens = word_tokenize(text)
+        words_and_pos = pos_tag(tokens)
+        content_pos_tags = {'NN', 'NNS', 'NNP', 'NNPS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS'}
+        content_words = [word for word, pos in words_and_pos if pos in content_pos_tags and word.lower() not in self.stop_words]
+        content_word_count = len(content_words)
+        total_words = len(tokens)
+        return content_word_count / total_words if total_words > 0 else 0   
+    
+        
+    
+    ### Features for Punctuation Usage ###
+    def _punctuation_usage(self, text):
+        punctuations = Counter(char for char in text if char in string.punctuation)
+        return sum(punctuations.values())
+
+
+    ### Features for Sentiment Analysis ###     
+
+    def _sentiment_indicator(self, text):
+        segments = self._split_into_segments(text)
+        # Process in batches
+        results = self.sentiment_pipeline(segments)
+
+        sentiment_score = sum(1 if res['label'] == 'POSITIVE' else -1 if res['label'] == 'NEGATIVE' else 0 for res in results)
+
+        if sentiment_score > 0:
+            return 1  # Positive sentiment
+        elif sentiment_score < 0:
+            return -1  # Negative sentiment
+        else:
+            return 0  # Neutral sentiment
+        
+    def _sentiment_strength(self, text):
+        segments = self._split_into_segments(text)
+        if not segments:
+            return 0  # Return early if no segments
+
+        # Process all segments at once using batch processing
+        results = self.sentiment_pipeline(segments)
+
+        # Initialize sentiment counts
+        sentiment_counts = {1: 0, -1: 0, 0: 0}
+
+        # Update counts based on results from the batch processing
+        for res in results:
+            label = 1 if res['label'] == 'POSITIVE' else -1 if res['label'] == 'NEGATIVE' else 0
+            sentiment_counts[label] += 1
+
+        # Return the count of the most predominant sentiment
+        predominant_sentiment = max(sentiment_counts, key=sentiment_counts.get)
+        return sentiment_counts[predominant_sentiment]
+
+    def _split_into_segments(self, text, max_length=510):
+        # Use the BERT tokenizer to tokenize the text and manage the max token length
+        tokens = self.tokenizer.tokenize(text)
+        segments = []
+        current_segment = []
+        
+        for token in tokens:
+            if len(current_segment) + 1 > max_length:
+                segments.append(self.tokenizer.convert_tokens_to_string(current_segment))
+                current_segment = []
+            current_segment.append(token)
+        
+        if current_segment:
+            segments.append(self.tokenizer.convert_tokens_to_string(current_segment))
+        
+        return segments
+
+    def _extract_features(self, text):
+        features = [
+            # Phraseology
+            self._document_length(text),
+            self._mean_sentence_length(text),
+            self._mean_word_length(text),
+            #Lexical
+            self._readability(text),
+            self._lexical_richness(text),   
+            self._semantic_repetition(text),      
+            self._function_words_frequency(text),
+            self._content_words_frequency(text),
+            # Punctuation
+            self._punctuation_usage(text),
+            # Sentiment
+            self._sentiment_indicator(text),
+            self._sentiment_strength(text),
+        ]
+        return features
+
+    def create_feature_matrix(self, directory):
+        features_matrix = []
+        for root, _, files in os.walk(directory):
+            for filename in files:
+                file_path = os.path.join(root, filename)
+                with open(file_path, 'r', encoding='utf-8') as f:
+                    text = f.read()
+                    features = self._extract_features(text)
+                    features_matrix.append(features)
+        return np.array(features_matrix)
+
@@ -0,0 +1,48 @@
+# Stylometry Approach for Detecting Writing Style Changes in Poetry
+
+
+## Overview
+This project analyzes the stylistic changes in the poetry of Maya Angelou across different periods of her career using natural language processing techniques. The focus is on extracting and comparing stylistic features and topic models to understand the evolution of her writing style.
+
+## Project Structure
+- `DataProcessor.py`: Contains functions for data preprocessing and feature extraction.
+- `TextModeler.py`: Includes methods for text modeling and analysis.
+- `Stylometry.py`: Tools for performing stylometric analysis.
+- `Visualizer.py`: Provides visualization functions for the analysis results.
+- `SemanticRepetitionDetector.py`: Detects semantic repetitions using a BERT model.
+
+
+## Extracting the Project
+
+After downloading the NLP_project.zip file, extract it to your desired location.
+
+## Running the Analysis
+
+1. Navigate to the `NLP_project` folder.
+2. Ensure that the folders `initial state` and `final state` are present within the `Maya Angelou` folder.
+3. Run the main script file using Python. Ensure you have all the required dependencies installed.
+
+## Notes
+- The script uses relative paths to access the corpus data. It expects the corpus folders to be in the `Maya Angelou` directory within the root project directory.
+- If you encounter any path errors, please check that the folder structure matches the expected format and that the script is executed from the root project directory.
+
+
+## Installation
+To run this project, you will need to install the required Python libraries. You can install them using the following command:
+
+```bash
+pip install -r requirements.txt
+```
+
+
+## IMPORTANT NOTE
+
+The method `_semantic_repetition(self, text)` in the `DataProcessor` class
+takes more than 5 minutes to run, because it uses a BERT model to check each poem for semantic repetition.
+
+
+
+Daniel Adam, I.D. 342475639
+B.Sc. in Computer Science with specialization in Data Science
+Topics in Natural Language Processing
+Ben Gurion University of the Negev
+
@@ -0,0 +1,27 @@
+from transformers import BertTokenizer, BertModel
+import torch
+from nltk.tokenize import sent_tokenize
+import numpy as np
+
+class SemanticRepetitionDetector:
+    def __init__(self):
+        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        self.model = BertModel.from_pretrained('bert-base-uncased')
+
+    def _get_embeddings(self, text):
+        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
+        outputs = self.model(**inputs)
+        return outputs.last_hidden_state.mean(1)
+
+    def count_repetitions(self, text, threshold=0.9):
+        sentences = sent_tokenize(text)
+        embeddings = [self._get_embeddings(sentence).detach().numpy() for sentence in sentences]
+        repetition_count = 0
+        
+        # Compare each sentence to every other sentence
+        for i in range(len(embeddings)):
+            for j in range(i + 1, len(embeddings)):
+                sim = np.dot(embeddings[i], embeddings[j].T) / (np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[j]))
+                if sim > threshold:
+                    repetition_count += 1
+        return repetition_count
@@ -0,0 +1,83 @@
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from sklearn.cluster import KMeans
+from sklearn.metrics import silhouette_score
+import os
+
+
+class Stylometry:
+    def __init__(self):
+        self.pca = None
+        self.kmeans = None
+        self.n_components = None
+        self.n_clusters = None
+        self.train_data = None
+        self.test_data = None
+        self.reduced_train_data = None
+        self.reduced_test_data = None
+        self.train_labels = None
+        self.test_labels = None
+
+    
+    def _choose_n_components(self, data, variance_threshold=0.95):
+        pca = PCA()
+        pca.fit(data)
+        cumulative_variance = np.cumsum(pca.explained_variance_ratio_)
+        n_components = np.where(cumulative_variance >= variance_threshold)[0][0] + 1
+        return n_components
+
+    def _choose_n_clusters(self, data, k_range=range(2, 10)):
+        scores = []
+        ssd = []
+        for k in k_range:
+            kmeans = KMeans(n_clusters=k, random_state=42)
+            labels = kmeans.fit_predict(data)
+            score = silhouette_score(data, labels)
+            scores.append(score)
+            ssd.append(kmeans.inertia_)
+
+        
+        self._plot_elbow_method(ssd, k_range)
+        optimal_k = k_range[np.argmax(scores)]
+        return optimal_k
+
+    def _plot_elbow_method(self, ssd, k_range):
+        plt.figure(figsize=(8, 4))
+        plt.plot(k_range, ssd, 'bx-')
+        plt.xlabel('k (number of clusters)')
+        plt.ylabel('Sum of squared distances')
+        plt.title('Elbow Method For Optimal k')
+        plots_directory = r"Visualizations\Plots of clustering"
+        if not os.path.exists(plots_directory):
+            os.makedirs(plots_directory)
+        file_path = os.path.join(plots_directory, f'Elbow Method for Optimal k.png')
+        plt.savefig(file_path)   
+        plt.show()
+
+    def fit(self, train_data):
+        self.train_data = train_data
+        self.n_components = self._choose_n_components(train_data)
+        self.pca = PCA(n_components=self.n_components)
+        self.reduced_train_data = self.pca.fit_transform(train_data)
+        self.n_clusters = self._choose_n_clusters(self.reduced_train_data)
+        self.kmeans = KMeans(n_clusters=self.n_clusters)
+        self.train_labels = self.kmeans.fit_predict(self.reduced_train_data)
+
+    def predict(self, test_data):
+        if self.pca is None or self.kmeans is None:
+            raise ValueError("Must fit on train data before predicting.")
+        self.test_data = test_data
+        self.reduced_test_data = self.pca.transform(test_data)
+        self.test_labels = self.kmeans.predict(self.reduced_test_data)
+        return self.test_labels
+
+
+    def get_reduced_date(self):
+        return self.reduced_train_data, self.reduced_test_data
+    
+    def get_labels(self):
+        return self.train_labels, self.test_labels
+
+    
+