Extracted both ngram and unigram features.

Tuned SVM with crossval, separated by targets and trained. Got the results for each target's average, positive, negative.
yarenozyer · Apr 24, 2024 · 903cae5 · 903cae5
1 parent 1ea7492
commit 903cae5
Show file tree

Hide file tree

Showing 4 changed files with 1,120 additions and 1,010 deletions.
diff --git a/results.txt b/results.txt
@@ -0,0 +1,60 @@
+C:1.0 kernel: linear gamma:scale
+SVM with 1,2,3 word n-grams and 3,4,5 character n-grams without crossval::::
+Ateizm Accuracy: 76.47058823529412
+Ateizm F Macro:  53.103448275862064
+İklim Değişikliği Gerçek Bir Endişe Kaynağı Accuracy: 90.990990990991
+İklim Değişikliği Gerçek Bir Endişe Kaynağı F Macro:  47.64150943396226
+Feminist Hareket Accuracy: 70.09803921568627
+Feminist Hareket F Macro:  63.81611468116658
+Hillary Clinton Accuracy: 81.4207650273224
+Hillary Clinton F Macro:  61.8936795688388
+Kürtajın Yasallaştırılması Accuracy: 82.53012048192771
+Kürtajın Yasallaştırılması F Macro:  69.28867623604465
+-------------------------------------------------------------------
+Best Parameters: {'C': 10, 'kernel': 'sigmoid'}
+
+Ateizm Accuracy: 76.47058823529412
+Ateizm F Macro:  53.103448275862064
+Ateizm F1-Score (Positive Class): 86.20689655172413
+Ateizm F1-Score (Negative Class): 20.0
+İklim Değişikliği Gerçek Bir Endişe Kaynağı Accuracy: 90.990990990991
+İklim Değişikliği Gerçek Bir Endişe Kaynağı F Macro:  47.64150943396226
+İklim Değişikliği Gerçek Bir Endişe Kaynağı F1-Score (Positive Class): 0.0
+İklim Değişikliği Gerçek Bir Endişe Kaynağı F1-Score (Negative Class): 95.28301886792453
+Feminist Hareket Accuracy: 68.62745098039215
+Feminist Hareket F Macro:  63.255656872678145
+Feminist Hareket F1-Score (Positive Class): 77.30496453900709
+Feminist Hareket F1-Score (Negative Class): 49.2063492063492
+Hillary Clinton Accuracy: 81.4207650273224
+Hillary Clinton F Macro:  66.98853989813243
+Hillary Clinton F1-Score (Positive Class): 88.81578947368422
+Hillary Clinton F1-Score (Negative Class): 45.16129032258064
+Kürtajın Yasallaştırılması Accuracy: 83.13253012048193
+Kürtajın Yasallaştırılması F Macro:  72.90111940298507
+Kürtajın Yasallaştırılması F1-Score (Positive Class): 89.55223880597015
+Kürtajın Yasallaştırılması F1-Score (Negative Class): 56.25
+
+unigram
+
+Ateizm Accuracy: 79.41176470588235
+Ateizm F Macro:  55.1789077212806
+Ateizm F1-Score (Positive Class): 88.13559322033898
+Ateizm F1-Score (Negative Class): 22.22222222222222
+İklim Değişikliği Gerçek Bir Endişe Kaynağı Accuracy: 90.990990990991
+İklim Değişikliği Gerçek Bir Endişe Kaynağı F Macro:  47.64150943396226
+İklim Değişikliği Gerçek Bir Endişe Kaynağı F1-Score (Positive Class): 0.0
+İklim Değişikliği Gerçek Bir Endişe Kaynağı F1-Score (Negative Class): 95.28301886792453
+Feminist Hareket Accuracy: 60.29411764705882
+Feminist Hareket F Macro:  54.46303083749001
+Feminist Hareket F1-Score (Positive Class): 70.7581227436823
+Feminist Hareket F1-Score (Negative Class): 38.16793893129771
+Hillary Clinton Accuracy: 78.68852459016394
+Hillary Clinton F Macro:  67.29896907216495
+Hillary Clinton F1-Score (Positive Class): 86.5979381443299
+Hillary Clinton F1-Score (Negative Class): 48.0
+Kürtajın Yasallaştırılması Accuracy: 73.49397590361446
+Kürtajın Yasallaştırılması F Macro:  64.93855606758832
+Kürtajın Yasallaştırılması F1-Score (Positive Class): 82.25806451612904
+Kürtajın Yasallaştırılması F1-Score (Negative Class): 47.61904761904761
+
+-Negative class performs really badly compared to positive, except for climate change. So does the minority class perform badly?
diff --git a/testSVM.py b/testSVM.py
@@ -3,23 +3,16 @@
 from snowballstemmer import TurkishStemmer
 import string
 from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.model_selection import train_test_split
+from sklearn.model_selection import GridSearchCV 
 from sklearn.svm import SVC
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import accuracy_score, f1_score
 import pandas as pd
 import nltk
 
 #stemmer = TurkishStemmer()
-
-print("wtfff")
-def read_turkish_tweets(file):
-    df = pd.read_csv(file, encoding='windows-1254')
-    tweets = df["Tweet"].tolist()
-    targets = df["Target"].tolist()
-    stances = df["Stance"].tolist()
-    return tweets, stances, targets
 
 def detect_stopwords():
+    #print("Detecting stopwords")
     stopwords_df = pd.read_csv('turkish', header=None)
     stop_words = stopwords_df[0].tolist()
     #stop_words = stopwords.words('turkish')
@@ -30,7 +23,6 @@ def detect_stopwords():
 
 
 def tokenize_tweet(tweet):
-    # Tokenization
     tokens = word_tokenize(tweet)
     stop_words = []
     normalized_tokens = [token.lower() for token in tokens]
@@ -39,27 +31,88 @@ def tokenize_tweet(tweet):
 
     return filtered_tokens
 
-def extract_features_tfidf(tweets):
-
-    # Feature extraction: n-grams
+def extract_features_tfidf_ngram(train_tweets, test_tweets):
+    # Build dense TF-IDF features from word n-grams (1-3) combined with
+    # character n-grams (2-5).
+    #
+    # train_tweets / test_tweets: sequences of token lists; each token list is
+    # re-joined with single spaces before being vectorized.
+    # Both vectorizers are fitted on train_tweets only and then applied to
+    # test_tweets, so the test features use the training vocabulary.
+    #
+    # Returns (train_matrix, test_matrix): for each split, the word and
+    # character feature arrays concatenated column-wise (axis=1).
+    #print("Extracting Features")
     tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='word')
-    tfidf_features = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in tweets])
+    word_train_features = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in train_tweets])
+    word_test_features = tfidf_vectorizer.transform([' '.join(tokens) for tokens in test_tweets])
+
+    # NOTE(review): results.txt describes "3,4,5 character n-grams" while this
+    # range is (2, 5) - confirm which one is intended.
+    char_tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 5), analyzer='char')
+    char_train_features = char_tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in train_tweets])
+    char_test_features = char_tfidf_vectorizer.transform([' '.join(tokens) for tokens in test_tweets])
+
+    # NOTE(review): `np` is not imported in the visible part of this file -
+    # confirm that numpy is imported as `np` above this hunk.
+    return np.concatenate((word_train_features.toarray(), char_train_features.toarray()), axis=1), np.concatenate((word_test_features.toarray(), char_test_features.toarray()), axis=1)
+
+def extract_features_tfidf_unigram(train_tweets, test_tweets):
+    # Build dense TF-IDF features from word unigrams only.
+    #
+    # train_tweets / test_tweets: sequences of token lists; each token list is
+    # re-joined with single spaces before being vectorized.
+    # The vectorizer is fitted on train_tweets only and then applied to
+    # test_tweets.
+    #
+    # Returns (train_array, test_array) as dense arrays.
+    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), analyzer='word')
+    word_train_features = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in train_tweets])
+    word_test_features = tfidf_vectorizer.transform([' '.join(tokens) for tokens in test_tweets])
+
+    return word_train_features.toarray(), word_test_features.toarray()
+
+def t_tweets(file):
+    # Read a tweet CSV (decoded as windows-1254) and group its rows by the
+    # "Target" column.
+    #
+    # Returns (tweets_by_target, stances_by_target): two dicts keyed by target
+    # value, holding the "Tweet" and "Stance" column values for that target as
+    # lists in matching row order.
+    #print("Reading File")
+    df = pd.read_csv(file, encoding='windows-1254')
+
+    tweets_by_target = {}
+    stances_by_target = {}
 
-    return tfidf_features.toarray()
+    unique_targets = df["Target"].unique()
+    for target in unique_targets:
+        # Rows belonging to the current target only.
+        target_df = df[df["Target"] == target]
+
+        tweets_by_target[target] = target_df["Tweet"].tolist()
+        stances_by_target[target] = target_df["Stance"].tolist()
+
+    return tweets_by_target, stances_by_target
+
+def svm_for_target(tweets_train, stances_train, tweets_test, stances_test, target):
+    # Train an SVM on the tweets of a single target and print its test metrics.
+    #
+    # tweets_train / stances_train / tweets_test / stances_test: dicts keyed by
+    # target (as returned by t_tweets); only the entries for `target` are used.
+    # target: key to select; it is also used as the prefix of every printed line.
+    #
+    # Prints accuracy, macro F1 and two per-class F1 scores, each multiplied by
+    # 100. Returns nothing.
+    subtweets_train = tweets_train[target]
+    substances_train = stances_train[target]
+    subtweets_test = tweets_test[target]
+    substances_test = stances_test[target]
+
+    tokenized_train = [tokenize_tweet(tweet) for tweet in subtweets_train]
+    tokenized_test = [tokenize_tweet(tweet) for tweet in subtweets_test]
+
+    train_features, test_features = extract_features_tfidf_ngram(tokenized_train, tokenized_test)
+    # NOTE(review): param_grid is only consumed by the commented-out grid
+    # search below; with that search disabled it has no effect.
+    param_grid = {
+        'C': [0.1, 1, 10, 100],
+        'kernel': ['rbf', 'linear', 'poly', 'sigmoid']
+    }
 
-train_tweets, train_labels, train_targets = read_turkish_tweets('translated_train_without_none.csv')
+    # Perform GridSearchCV to find the best parameters
+    #grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
+    #grid_search.fit(train_features, substances_train)
 
-tokenized_tweets = [tokenize_tweet(tweet) for tweet in train_tweets]
+    # Get the best parameters
+    #best_params = grid_search.best_params_
+
+    # The kernel and C below are hard-coded; they match the "Best Parameters"
+    # line recorded in results.txt rather than being read from a grid search.
+    # Train SVM with the best parameters
+    svm_classifier = SVC(kernel='sigmoid', C=10)
+    svm_classifier.fit(train_features, substances_train)
+
+    #print("Evaluating Results")
+    stance_pred = svm_classifier.predict(test_features)
+    accuracy = accuracy_score(substances_test, stance_pred)
+    f_macro = f1_score(substances_test, stance_pred, average='macro')
+    # NOTE(review): with average=None the scores come back one per class, in
+    # label order (presumably sorted label values - confirm against the
+    # scikit-learn docs). Index 0 is the "positive" class only if the positive
+    # Stance label comes first in that order; verify against the CSV's Stance
+    # values, otherwise the two printed labels below are swapped.
+    f1_positive = f1_score(substances_test, stance_pred, average=None)[0]  # Positive class
+    f1_negative = f1_score(substances_test, stance_pred, average=None)[1]  # Negative class
+
+    print(target + " Accuracy:", accuracy*100)
+    print(target + " F Macro: ", f_macro*100)
+    print(target + " F1-Score (Positive Class):", f1_positive * 100)
+    print(target + " F1-Score (Negative Class):", f1_negative * 100)
+
+
+# Load the train and test CSVs, each grouped by target.
+tweets_train, stances_train = t_tweets('translated_train_without_none.csv')
 
-features = extract_features_tfidf(tokenized_tweets)
+tweets_test, stances_test = t_tweets('translated_test_without_none.csv')
 
-X_train, X_test, y_train, y_test = train_test_split(features, train_labels, test_size=0.2, random_state=42)
+# Train and evaluate a separate SVM for each of the five targets.
+svm_for_target(tweets_train, stances_train, tweets_test, stances_test, "Ateizm")
+svm_for_target(tweets_train, stances_train, tweets_test, stances_test, "İklim Değişikliği Gerçek Bir Endişe Kaynağı")
+svm_for_target(tweets_train, stances_train, tweets_test, stances_test, "Feminist Hareket")
+svm_for_target(tweets_train, stances_train, tweets_test, stances_test, "Hillary Clinton")
+svm_for_target(tweets_train, stances_train, tweets_test, stances_test, "Kürtajın Yasallaştırılması")
 
-# Step 2: Train the SVM classifier
-svm_classifier = SVC(kernel='linear')  # You can choose different kernels based on your data
-svm_classifier.fit(X_train, y_train)
 
-# Step 3: Evaluate the classifier
-y_pred = svm_classifier.predict(X_test)
-accuracy = accuracy_score(y_test, y_pred)
-print("Accuracy:", accuracy)