Skip to content

Commit

Permalink
Extracted both ngram and unigram features.
Browse files Browse the repository at this point in the history
Tuned the SVM with cross-validation, separated the data by target, and trained. Got the results for each target: macro average, positive class, and negative class.
  • Loading branch information
yaren.ozyer committed Apr 24, 2024
1 parent 1ea7492 commit 903cae5
Show file tree
Hide file tree
Showing 4 changed files with 1,120 additions and 1,010 deletions.
60 changes: 60 additions & 0 deletions results.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
C:1.0 kernel: linear gamma:scale
SVM with 1,2,3 word n-grams and 3,4,5 character n-grams without crossval::::
Ateizm Accuracy: 76.47058823529412
Ateizm F Macro: 53.103448275862064
İklim Değişikliği Gerçek Bir Endişe Kaynağı Accuracy: 90.990990990991
İklim Değişikliği Gerçek Bir Endişe Kaynağı F Macro: 47.64150943396226
Feminist Hareket Accuracy: 70.09803921568627
Feminist Hareket F Macro: 63.81611468116658
Hillary Clinton Accuracy: 81.4207650273224
Hillary Clinton F Macro: 61.8936795688388
Kürtajın Yasallaştırılması Accuracy: 82.53012048192771
Kürtajın Yasallaştırılması F Macro: 69.28867623604465
-------------------------------------------------------------------
Best Parameters: {'C': 10, 'kernel': 'sigmoid'}

Ateizm Accuracy: 76.47058823529412
Ateizm F Macro: 53.103448275862064
Ateizm F1-Score (Positive Class): 86.20689655172413
Ateizm F1-Score (Negative Class): 20.0
İklim Değişikliği Gerçek Bir Endişe Kaynağı Accuracy: 90.990990990991
İklim Değişikliği Gerçek Bir Endişe Kaynağı F Macro: 47.64150943396226
İklim Değişikliği Gerçek Bir Endişe Kaynağı F1-Score (Positive Class): 0.0
İklim Değişikliği Gerçek Bir Endişe Kaynağı F1-Score (Negative Class): 95.28301886792453
Feminist Hareket Accuracy: 68.62745098039215
Feminist Hareket F Macro: 63.255656872678145
Feminist Hareket F1-Score (Positive Class): 77.30496453900709
Feminist Hareket F1-Score (Negative Class): 49.2063492063492
Hillary Clinton Accuracy: 81.4207650273224
Hillary Clinton F Macro: 66.98853989813243
Hillary Clinton F1-Score (Positive Class): 88.81578947368422
Hillary Clinton F1-Score (Negative Class): 45.16129032258064
Kürtajın Yasallaştırılması Accuracy: 83.13253012048193
Kürtajın Yasallaştırılması F Macro: 72.90111940298507
Kürtajın Yasallaştırılması F1-Score (Positive Class): 89.55223880597015
Kürtajın Yasallaştırılması F1-Score (Negative Class): 56.25

unigram

Ateizm Accuracy: 79.41176470588235
Ateizm F Macro: 55.1789077212806
Ateizm F1-Score (Positive Class): 88.13559322033898
Ateizm F1-Score (Negative Class): 22.22222222222222
İklim Değişikliği Gerçek Bir Endişe Kaynağı Accuracy: 90.990990990991
İklim Değişikliği Gerçek Bir Endişe Kaynağı F Macro: 47.64150943396226
İklim Değişikliği Gerçek Bir Endişe Kaynağı F1-Score (Positive Class): 0.0
İklim Değişikliği Gerçek Bir Endişe Kaynağı F1-Score (Negative Class): 95.28301886792453
Feminist Hareket Accuracy: 60.29411764705882
Feminist Hareket F Macro: 54.46303083749001
Feminist Hareket F1-Score (Positive Class): 70.7581227436823
Feminist Hareket F1-Score (Negative Class): 38.16793893129771
Hillary Clinton Accuracy: 78.68852459016394
Hillary Clinton F Macro: 67.29896907216495
Hillary Clinton F1-Score (Positive Class): 86.5979381443299
Hillary Clinton F1-Score (Negative Class): 48.0
Kürtajın Yasallaştırılması Accuracy: 73.49397590361446
Kürtajın Yasallaştırılması F Macro: 64.93855606758832
Kürtajın Yasallaştırılması F1-Score (Positive Class): 82.25806451612904
Kürtajın Yasallaştırılması F1-Score (Negative Class): 47.61904761904761

-The negative class performs much worse than the positive class for every target except climate change. So does the minority class perform badly?
107 changes: 80 additions & 27 deletions testSVM.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,23 +3,16 @@
from snowballstemmer import TurkishStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score
import pandas as pd
import nltk

#stemmer = TurkishStemmer()

# NOTE(review): leftover debug output; as a top-level statement it runs
# every time this module is executed or imported.
print("wtfff")
def read_turkish_tweets(file, encoding='windows-1254'):
    """Read a stance CSV file and return its columns as parallel lists.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.
            The file must contain "Tweet", "Target" and "Stance" columns.
        encoding: Text encoding of the CSV file. Defaults to
            'windows-1254', the value this function always used before the
            parameter existed.

    Returns:
        A ``(tweets, stances, targets)`` tuple of lists. The three lists
        are row-aligned; note that stances come before targets.
    """
    df = pd.read_csv(file, encoding=encoding)
    tweets = df["Tweet"].tolist()
    targets = df["Target"].tolist()
    stances = df["Stance"].tolist()
    return tweets, stances, targets

def detect_stopwords():
#print("Detecting stopwords")
stopwords_df = pd.read_csv('turkish', header=None)
stop_words = stopwords_df[0].tolist()
#stop_words = stopwords.words('turkish')
Expand All @@ -30,7 +23,6 @@ def detect_stopwords():


def tokenize_tweet(tweet):
# Tokenization
tokens = word_tokenize(tweet)
stop_words = []
normalized_tokens = [token.lower() for token in tokens]
Expand All @@ -39,27 +31,88 @@ def tokenize_tweet(tweet):

return filtered_tokens

def extract_features_tfidf(tweets):

# Feature extraction: n-grams
def extract_features_tfidf_ngram(train_tweets, test_tweets):
    """Build dense TF-IDF features from word 1-3-grams and char 2-5-grams.

    Both vectorizers are fitted on the training tweets only and are then
    applied unchanged to the test tweets.

    Args:
        train_tweets: Iterable of token sequences, one per training tweet.
        test_tweets: Iterable of token sequences, one per test tweet.

    Returns:
        A ``(train_features, test_features)`` tuple of dense arrays. In
        each array the word n-gram columns come first, followed by the
        character n-gram columns.
    """
    # Re-join the tokens once; both vectorizers consume the same documents.
    train_docs = [' '.join(tokens) for tokens in train_tweets]
    test_docs = [' '.join(tokens) for tokens in test_tweets]

    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='word')
    word_train_features = tfidf_vectorizer.fit_transform(train_docs)
    word_test_features = tfidf_vectorizer.transform(test_docs)

    char_tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 5), analyzer='char')
    char_train_features = char_tfidf_vectorizer.fit_transform(train_docs)
    char_test_features = char_tfidf_vectorizer.transform(test_docs)

    # Concatenate column-wise so every row keeps describing a single tweet.
    train_features = np.concatenate(
        (word_train_features.toarray(), char_train_features.toarray()), axis=1
    )
    test_features = np.concatenate(
        (word_test_features.toarray(), char_test_features.toarray()), axis=1
    )
    return train_features, test_features

def extract_features_tfidf_unigram(train_tweets, test_tweets):
    """Build dense word-unigram TF-IDF features for train and test tweets.

    The vectorizer is fitted on the training tweets and then applied as-is
    to the test tweets.

    Args:
        train_tweets: Iterable of token sequences, one per training tweet.
        test_tweets: Iterable of token sequences, one per test tweet.

    Returns:
        A ``(train_features, test_features)`` tuple of dense arrays.
    """
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), analyzer='word')

    # Each tweet arrives tokenized; the vectorizer expects plain strings.
    train_matrix = vectorizer.fit_transform(
        [' '.join(words) for words in train_tweets]
    )
    test_matrix = vectorizer.transform(
        [' '.join(words) for words in test_tweets]
    )

    return train_matrix.toarray(), test_matrix.toarray()

def t_tweets(file):
    """Read a stance CSV file and group its tweets and stances by target.

    The file is read with the 'windows-1254' encoding and must contain
    "Tweet", "Target" and "Stance" columns.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_csv``.

    Returns:
        A ``(tweets_by_target, stances_by_target)`` tuple of dicts. Both
        are keyed by the distinct values of the "Target" column, in order
        of first appearance; each value is the list of tweets (or stances)
        for that target, in file order.
    """
    df = pd.read_csv(file, encoding='windows-1254')

    tweets_by_target = {}
    stances_by_target = {}

    for target in df["Target"].unique():
        target_df = df[df["Target"] == target]

        tweets_by_target[target] = target_df["Tweet"].tolist()
        stances_by_target[target] = target_df["Stance"].tolist()

    return tweets_by_target, stances_by_target

def svm_for_target(tweets_train, stances_train, tweets_test, stances_test, target):
    """Train an SVM on one target's tweets and print its test metrics.

    Args:
        tweets_train: Dict mapping target name to a list of training tweets.
        stances_train: Dict mapping target name to the training stance labels.
        tweets_test: Dict mapping target name to a list of test tweets.
        stances_test: Dict mapping target name to the test stance labels.
        target: Key selecting which target to train and evaluate on.

    Returns:
        None. Accuracy, macro F1 and the first two per-class F1 scores are
        printed as percentages, each prefixed with ``target``.

    Raises:
        KeyError: If ``target`` is missing from any of the four dicts.
    """
    subtweets_train = tweets_train[target]
    substances_train = stances_train[target]
    subtweets_test = tweets_test[target]
    substances_test = stances_test[target]

    tokenized_train = [tokenize_tweet(tweet) for tweet in subtweets_train]
    tokenized_test = [tokenize_tweet(tweet) for tweet in subtweets_test]

    train_features, test_features = extract_features_tfidf_ngram(tokenized_train, tokenized_test)

    # Hyper-parameters are fixed here. The grid that used to be searched
    # with GridSearchCV was C in {0.1, 1, 10, 100} and kernel in
    # {'rbf', 'linear', 'poly', 'sigmoid'}.
    svm_classifier = SVC(kernel='sigmoid', C=10)
    svm_classifier.fit(train_features, substances_train)

    stance_pred = svm_classifier.predict(test_features)
    accuracy = accuracy_score(substances_test, stance_pred)
    f_macro = f1_score(substances_test, stance_pred, average='macro')

    # Compute the per-class scores once and index into the result.
    # NOTE(review): with average=None the scores are presumably ordered by
    # sorted label value, so index 0 is the "positive" class only if that
    # label sorts first - confirm against the values in the Stance column.
    per_class_f1 = f1_score(substances_test, stance_pred, average=None)
    f1_positive = per_class_f1[0]  # Reported as the positive class
    f1_negative = per_class_f1[1]  # Reported as the negative class

    print(target + " Accuracy:", accuracy*100)
    print(target + " F Macro: ", f_macro*100)
    print(target + " F1-Score (Positive Class):", f1_positive * 100)
    print(target + " F1-Score (Negative Class):", f1_negative * 100)


# Load the training and test files, each grouped by stance target.
tweets_train, stances_train = t_tweets('translated_train_without_none.csv')
tweets_test, stances_test = t_tweets('translated_test_without_none.csv')

# Train and report one classifier per target. The single-model pipeline
# that used to follow (train_test_split over all targets at once) is gone:
# it depended on names that are no longer defined at module level.
for target in (
    "Ateizm",
    "İklim Değişikliği Gerçek Bir Endişe Kaynağı",
    "Feminist Hareket",
    "Hillary Clinton",
    "Kürtajın Yasallaştırılması",
):
    svm_for_target(tweets_train, stances_train, tweets_test, stances_test, target)
Loading

0 comments on commit 903cae5

Please sign in to comment.