Commit
translated the test set and changed the feature extraction only to word ngrams.
YAREN BUSE OZYER committed Apr 24, 2024
1 parent 75b3079 commit 1ea7492
Showing 3 changed files with 1,012 additions and 20 deletions.
44 changes: 24 additions & 20 deletions testSVM.py
@@ -3,16 +3,21 @@
from snowballstemmer import TurkishStemmer
import string
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import pandas as pd
import nltk

#stemmer = TurkishStemmer()

print("wtfff")
def read_turkish_tweets(file):
    df = pd.read_csv(file, encoding='windows-1254')
    tweets = df["Tweet"].tolist()
    targets = df["Target"].tolist()
    return tweets, targets
    stances = df["Stance"].tolist()
    return tweets, stances, targets

def detect_stopwords():
    stopwords_df = pd.read_csv('turkish', header=None)
@@ -27,35 +32,34 @@ def detect_stopwords():
def tokenize_tweet(tweet):
    # Tokenization
    tokens = word_tokenize(tweet)
    stop_words = detect_stopwords()
    stop_words = []
    normalized_tokens = [token.lower() for token in tokens]
    filtered_tokens = [token for token in normalized_tokens if (token not in stop_words and not token.startswith("http"))]
    #stemmed_tokens = [stemmer.stemWord(token) for token in filtered_tokens]

    return filtered_tokens

def extract_features_tfidf(tweets):
    # Tokenization and preprocessing
    tokenized_tweets = [tokenize_tweet(tweet) for tweet in tweets]

    # Feature extraction: n-grams
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), analyzer='word')
    tfidf_features = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in tokenized_tweets])

    # Feature extraction: character n-grams
    char_tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 5), analyzer='char')
    char_tfidf_features = char_tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in tokenized_tweets])

    # Feature extraction: sentiment lexicon features, target presence/absence, POS tags, encodings
    # These features remain the same as before
    tfidf_features = tfidf_vectorizer.fit_transform([' '.join(tokens) for tokens in tweets])

    # Combine all features
    all_features = np.concatenate((tfidf_features.toarray(), char_tfidf_features.toarray()), axis=1)
    np.savetxt('feature_matrix.csv', all_features, delimiter=',')
    return all_features
    return tfidf_features.toarray()

train_tweets, train_labels, train_targets = read_turkish_tweets('translated_train_without_none.csv')

tokenized_tweets = [tokenize_tweet(tweet) for tweet in train_tweets]

features = extract_features_tfidf(tokenized_tweets)

train_tweets, train_targets = read_turkish_tweets('translated_train_without_none.csv')
X_train, X_test, y_train, y_test = train_test_split(features, train_labels, test_size=0.2, random_state=42)

print(extract_features_tfidf(train_tweets))
# Step 2: Train the SVM classifier
svm_classifier = SVC(kernel='linear') # You can choose different kernels based on your data
svm_classifier.fit(X_train, y_train)

#print(tokenize_tweet(train_tweets[0]))
# Step 3: Evaluate the classifier
y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
0 comments on commit 1ea7492