Skip to content

Commit

Permalink
tested with native dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
yaren.ozyer committed Jul 4, 2024
1 parent 60c784e commit 6a79a83
Show file tree
Hide file tree
Showing 6 changed files with 22,354 additions and 67 deletions.
19,333 changes: 19,333 additions & 0 deletions google_translate_train_aug_8_without_none.csv

Large diffs are not rendered by default.

2,149 changes: 2,149 additions & 0 deletions gpt_train_without_none.csv

Large diffs are not rendered by default.

260 changes: 260 additions & 0 deletions output.csv

Large diffs are not rendered by default.

17 changes: 17 additions & 0 deletions parse_turkish_tweets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
import re

def parse_turkish_tweets(text: str, target, stance,
                         native_tweets: list, native_targets: list,
                         native_stances: list) -> None:
    """Clean a block of tweet text and append its lines to the output lists.

    The text is cleaned in this order:

    1. ``<ENAMEX TYPE="...">...</ENAMEX>`` tags are unwrapped, keeping only
       the text between the opening and closing tag.
    2. Tokens starting with ``http`` are removed.
    3. Tokens starting with ``@`` and tokens starting with ``#`` are removed.

    The cleaned text is then split on newlines. Each resulting line is
    stripped of surrounding whitespace, and lines that end up empty are
    dropped, so a line that consisted only of links, mentions or hashtags
    contributes nothing.

    Args:
        text: Raw text, possibly spanning several lines.
        target: Value appended to ``native_targets`` once per kept line.
        stance: Value appended to ``native_stances`` once per kept line.
        native_tweets: List extended in place with the kept lines.
        native_targets: List extended in place with copies of ``target``.
        native_stances: List extended in place with copies of ``stance``.

    Returns:
        None. The three ``native_*`` lists are mutated in place and always
        grow by the same number of items.
    """
    # Unwrap the entity markup first so the inner text survives the later
    # removals untouched by the tag syntax.
    cleaned_text = re.sub(r'<ENAMEX TYPE="[^"]+">([^<]+)</ENAMEX>', r'\1', text)

    # Strip links, then mentions and hashtags.
    cleaned_text = re.sub(r'http\S+', '', cleaned_text)
    cleaned_text = re.sub(r'@\S+', '', cleaned_text)
    cleaned_text = re.sub(r'#\S+', '', cleaned_text)

    # The split pattern already swallows whitespace around interior newlines;
    # stripping each fragment extends the same trimming to the outer edges
    # and turns whitespace-only fragments into '' so the filter drops them.
    fragments = (part.strip() for part in re.split(r'\s*\n\s*', cleaned_text))
    sentences = [fragment for fragment in fragments if fragment]

    # One target/stance label per kept sentence keeps the lists aligned.
    native_tweets.extend(sentences)
    native_targets.extend([target] * len(sentences))
    native_stances.extend([stance] * len(sentences))
Loading

0 comments on commit 6a79a83

Please sign in to comment.