.Rhistory

str_remove_all("http.*?( |$)") %>%
str_remove_all("\n") %>%
str_replace_all(pattern = "&amp;",replacement = "&") %>% #replaced with white space because of consecutive hashtags
str_replace_all("( )+", " ")# I might as well do all the preprocessing here actually...
# Spell-check the pre-processed tweets and keep one suggestion per word ----
# `for_spell_checking_tweets`, `to_ignore` and `no_emoji_tweets` are created
# earlier in the session (outside this part of the history).

# Check which dictionaries are installed before asking for "en_GB".
hunspell::list_dictionaries()

# Use the same British-English dictionary for detection AND suggestion.
# The namespace is explicit because quanteda also exports `dictionary()`
# and masks this one once it is attached.
spell_dict <- hunspell::dictionary("en_GB")

# Unique misspelled words across all tweets.
# (hunspell_check() takes a plain vector of words, not text/format/ignore,
# so the detection step stays with hunspell().)
spelling_mistakes <- hunspell(
  text = for_spell_checking_tweets,
  format = "text",
  dict = spell_dict,
  ignore = to_ignore
) %>%
  unlist() %>%
  unique()
spelling_mistakes

# First suggestion for every misspelled word, named by the word itself.
spelling_suggestions <- hunspell_suggest(spelling_mistakes, dict = spell_dict) %>%
  map(1) %>%
  set_names(nm = spelling_mistakes)
View(spelling_suggestions)

no_emoji_tweets
# Install qdapRegex only when it is missing, then attach it.
# install.packages() needs the package name as a quoted string.
if (!requireNamespace("qdapRegex", quietly = TRUE)) {
  install.packages("qdapRegex")
}
library(qdapRegex)
# Five hand-written example tweets used as a sandbox for the pre-processing
# steps. (An earlier copy with typos in tweet5 was overwritten immediately,
# so only the corrected vector is kept.)
sandbox <- c(
  tweet1 = "The EU commission has concluded an investigation against Poland due to recent events and found serious breaches of rule-of-law. Poland will be kicked out of the union",
  tweet2 = "Amazing organization. I attended the first TTNET high-level dialog conference",
  tweet3 = "ECB announced new measures against inflation",
  tweet4 = "After consultation with relevant stakeholders, we decided to stop vaccination purchases",
  tweet5 = "This day couldn't get any better!I received my mission letter to ensure our European way of life #EU#OurWay!! Great honor to be part of vdL"
)
#get a feeling of the data structure after quanteda preprocessing:####
library(quanteda)

# One document per sandbox tweet; document names come from the vector names.
sandbox_corpus <- quanteda::corpus(sandbox)
docnames(sandbox_corpus)

# Available stop-word sources (the function lives in the stopwords package).
stopwords::stopwords_getsources()

# Tokenise, lower-case, drop English stop words and stem.
# Stop-word removal and stemming use dfm_remove()/dfm_wordstem() instead of
# the deprecated `remove =` / `stem =` arguments of dfm().
sandbox_dfm <- tokens(
  x = sandbox_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(pattern = stopwords("en")) %>%
  dfm_wordstem(language = "en")
head(colnames(sandbox_dfm), 10)
# Tokenisation experiments on the sandbox corpus ----
# The same three steps are combined in different ways below, so each step is
# wrapped in a small helper instead of being pasted again for every run.

# Multi-word expressions that should become a single token.
eu_phrases <- phrase(c("European Union", "EU commission"))

# Tokenise `sandbox_corpus`, dropping punctuation, symbols, numbers and URLs.
sandbox_tokenise <- function(remove_separators = TRUE) {
  tokens(
    x = sandbox_corpus,
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = remove_separators
  )
}

# Join the EU phrases into single tokens, ignoring case.
sandbox_compound <- function(toks) {
  tokens_compound(toks, eu_phrases, case_insensitive = TRUE)
}

# Lower-cased dfm without English stop words, stemmed.
sandbox_finish <- function(toks) {
  toks %>%
    dfm(tolower = TRUE) %>%
    dfm_remove(pattern = stopwords("en")) %>%
    dfm_wordstem(language = "en")
}

# 1. Compounded phrases, separators removed.
sandbox_dfm <- sandbox_tokenise() %>%
  sandbox_compound() %>%
  sandbox_finish()
head(colnames(sandbox_dfm), 10)

# 2. No compounding, separators kept.
sandbox_dfm <- sandbox_tokenise(remove_separators = FALSE) %>%
  sandbox_finish()
head(colnames(sandbox_dfm), 10)

# 3. No compounding, separators removed.
sandbox_dfm <- sandbox_tokenise() %>%
  sandbox_finish()
head(colnames(sandbox_dfm), 10)
head(colnames(sandbox_dfm), 15)

# 4. Compounded phrases again, looking at more features.
sandbox_dfm <- sandbox_tokenise() %>%
  sandbox_compound() %>%
  sandbox_finish()
head(colnames(sandbox_dfm), 15)

# 5. Inspect the phrase pattern itself.
eu_phrases

# 6. Stop after compounding (the object is still a tokens object here).
sandbox_dfm <- sandbox_tokenise() %>%
  sandbox_compound()
sandbox_dfm

# 7. Compounding plus a lower-cased dfm, no stop-word removal or stemming.
sandbox_dfm <- sandbox_tokenise() %>%
  sandbox_compound() %>%
  dfm(tolower = TRUE)
sandbox_dfm
colnames(sandbox_dfm)

# 8. Full pipeline, all feature names.
sandbox_dfm <- sandbox_tokenise() %>%
  sandbox_compound() %>%
  sandbox_finish()
colnames(sandbox_dfm)

# 9. Drop hashtag tokens after compounding (result is a tokens object).
# The pattern is a regular expression, so valuetype must say so; under the
# default "glob" matching it is not treated as a regex.
sandbox_dfm <- sandbox_tokenise() %>%
  sandbox_compound() %>%
  tokens_remove(pattern = "[#][\\w_-]+", valuetype = "regex")
sandbox_dfm
View(sandbox_dfm)
# Full pipeline with hashtag tokens removed at the token level.
# Every step is chained with %>% (the first attempt dropped the pipe after
# tokens_remove(), leaving `dfm(.)` without an input), and the hashtag
# pattern is declared as a regular expression rather than a glob.
sandbox_dfm <- tokens(
  x = sandbox_corpus,
  remove_punct = TRUE,
  remove_symbols = TRUE,
  remove_numbers = TRUE,
  remove_url = TRUE,
  remove_separators = TRUE
) %>%
  tokens_compound(
    phrase(c("European Union", "EU commission")),
    case_insensitive = TRUE
  ) %>%
  tokens_remove(pattern = "[#][\\w_-]+", valuetype = "regex") %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(pattern = stopwords("en")) %>%
  dfm_wordstem(language = "en")
colnames(sandbox_dfm)
# Strip hashtags from the raw text BEFORE building the corpus ----
# str_remove_all() comes from stringr, so the tidyverse is attached first.
packs <- c("tidyverse", "spacyr")
pacman::p_load(char = packs)

# Notes on the pipeline below:
# * docnames are passed explicitly from names(sandbox); the session added
#   this after inspecting the cleaned vector, presumably because the names
#   did not survive str_remove_all() -- TODO confirm for the installed
#   stringr version.
# * tokens() must receive the piped corpus. Passing `x = sandbox_corpus`
#   inside the pipe, as the first attempts did, bypasses the cleaned text.
# * No token-level hashtag removal is needed: the hashtags are already gone
#   from the text at this point.
sandbox_dfm <- sandbox %>%
  str_remove_all("[#][\\w_-]+") %>%
  corpus(docnames = names(sandbox)) %>%
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE
  ) %>%
  tokens_compound(
    phrase(c("European Union", "EU commission")),
    case_insensitive = TRUE
  ) %>%
  dfm(tolower = TRUE) %>%
  dfm_remove(pattern = stopwords("en")) %>%
  dfm_wordstem(language = "en")
sandbox_dfm
# Download and unpack the policy-agendas topic dictionary ----
# download.file() needs a file path as `destfile`, not a directory, and the
# archive is removed with base::file.remove() (utils has no remove.file()).
dict_dir <- "./dictionaries"
dict_zip <- file.path(dict_dir, "LTDjun2013.zip")

# Make sure the target folder exists before writing into it.
if (!dir.exists(dict_dir)) {
  dir.create(dict_dir, recursive = TRUE)
}

utils::download.file(
  url = "http://www.snsoroka.com/wp-content/uploads/2020/08/LTDjun2013.zip",
  destfile = dict_zip,
  mode = "wb" # binary transfer for the zip archive
)

# Unpack directly into ./dictionaries; later code reads
# ./dictionaries/LTDjun2013/policy_agendas_english.lcd from there.
utils::unzip(zipfile = dict_zip, exdir = dict_dir)

# The archive is no longer needed once it is extracted.
file.remove(dict_zip)
# Load the topic dictionary from disk. The path goes in `file =`;
# `x =` is for dictionaries supplied as an in-memory list.
dict <- dictionary(file = "./dictionaries/LTDjun2013/policy_agendas_english.lcd")
# Interactive inspection of the topic dictionary ----
# Print the dictionary, its storage type and its top-level key names.
dict
typeof(dict)
names(dict)
# Look for keys whose entries mention "rule" / "law".
# As a regular expression "rule*" means "rul" followed by zero or more "e".
# NOTE(review): grep()/grepl() are applied to the dictionary object itself,
# so matching presumably runs on a character coercion of each key's
# entries -- confirm before relying on these indices.
dict[grepl(pattern = "rule*",x = dict)]
grep("rule*",dict)
grep("law",dict)
grep("law",dict,value = T)
grep("law",dict)
# Extract single keys by position ([[ ]] drops the container).
dict[[13]]
dict[[1]]
# Same position with single brackets (the call was typed across two lines).
dict[13
]
# Subset the dictionary to the keys matched by "rule", then only the first.
dict[grep("rule",x = dict)]
dict[grep("rule",x = dict)[1]]
# Apply the topic dictionary to the sandbox dfm ----

# Variant A: dfm_select() with the dictionary as the pattern.
sandbox_policy_dfm <- dfm_select(
  sandbox_dfm,
  pattern = dict,
  selection = "keep",
  case_insensitive = TRUE
)
sandbox_policy_dfm

# Variant B: dfm_lookup() with upper-cased dictionary keys.
sandbox_policy_dfm <- dfm_lookup(
  x = sandbox_dfm,
  dictionary = dict,
  case_insensitive = TRUE,
  capkeys = TRUE,
  verbose = TRUE
)
sandbox_policy_dfm

# Same lookup, converted to a data frame with the document id in `tweet_id`.
policy_lookup <- dfm_lookup(
  x = sandbox_dfm,
  dictionary = dict,
  case_insensitive = TRUE,
  capkeys = TRUE,
  verbose = TRUE
)
sandbox_policy <- convert(policy_lookup, to = "data.frame", docid_field = "tweet_id")
View(sandbox_policy)
# Fresh-session re-run of the sandbox pipeline ----
# Packages are attached BEFORE the pipeline: str_remove_all() and %>% come
# from the tidyverse, which the first re-run only loaded afterwards.
library(quanteda)
packs <- c("tidyverse", "spacyr")
pacman::p_load(char = packs)

sandbox <- c(
  tweet1 = "The EU commission has concluded an investigation against Poland due to recent events and found serious breaches of rule-of-law. Poland will be kicked out of the union",
  tweet2 = "Amazing organization. I attended the first TTNET high-level dialog conference",
  tweet3 = "ECB announced new measures against inflation",
  tweet4 = "After consultation with relevant stakeholders, we decided to stop vaccination purchases",
  tweet5 = "This day couldn't get any better!I received my mission letter to ensure our European way of life #EU#OurWay!! Great honor to be part of vdL"
)

#get a feeling of the data structure after quanteda preprocessing:####
sandbox_corpus <- quanteda::corpus(sandbox)

# Hashtags are stripped from the text, then all cleaning happens on tokens
# (lower-casing, stop words, stemming) and the dfm is built last.
# The stray trailing comma in the tokens() call has been removed.
sandbox_dfm <- sandbox %>%
  str_remove_all("[#][\\w_-]+") %>%
  corpus(docnames = names(sandbox)) %>%
  tokens(
    remove_punct = TRUE,
    remove_symbols = TRUE,
    remove_numbers = TRUE,
    remove_url = TRUE,
    remove_separators = TRUE
  ) %>%
  tokens_compound(
    phrase(c("European Union", "EU commission")),
    case_insensitive = TRUE
  ) %>%
  tokens_tolower() %>%
  tokens_remove(stopwords("en")) %>%
  tokens_wordstem(language = "en") %>%
  dfm()

# Topic counts per tweet as a data frame keyed by `tweet_id`.
dict <- dictionary(file = "./dictionaries/LTDjun2013/policy_agendas_english.lcd")
sandbox_policy <- sandbox_dfm %>%
  dfm_lookup(
    dictionary = dict,
    case_insensitive = TRUE,
    capkeys = TRUE,
    verbose = TRUE
  ) %>%
  convert(to = "data.frame", docid_field = "tweet_id")
View(sandbox_policy)
# Extend the "macroeconomics" key so the inflation tweet is picked up ----
# The key is addressed in lower case, as in the assignment below.
names(dict)
dict[["macroeconomics"]]

# The dfm features are stemmed, so check which form actually occurs there.
sandbox_dfm
colnames(sandbox_dfm)

# Reload a clean dictionary and append the stem "inflat".
# (An earlier try added "inflati" and was then discarded in the session.)
dict <- dictionary(file = "./dictionaries/LTDjun2013/policy_agendas_english.lcd")
dict[["macroeconomics"]] <- c(dict[["macroeconomics"]], "inflat")
dict[["macroeconomics"]]

# Re-run the lookup with the amended dictionary.
sandbox_policy <- sandbox_dfm %>%
  dfm_lookup(
    dictionary = dict,
    case_insensitive = TRUE,
    capkeys = TRUE,
    verbose = TRUE
  ) %>%
  convert(to = "data.frame", docid_field = "tweet_id")
View(sandbox_policy)
# Compare stringr and base R for hashtag removal ----
sandbox <- c(
  tweet1 = "The EU commission has concluded an investigation against Poland due to recent events and found serious breaches of rule-of-law. Poland will be kicked out of the union",
  tweet2 = "Amazing organization. I attended the first TTNET high-level dialog conference",
  tweet3 = "ECB announced new measures against inflation",
  tweet4 = "After consultation with relevant stakeholders, we decided to stop vaccination purchases",
  tweet5 = "This day couldn't get any better!I received my mission letter to ensure our European way of life #EU#OurWay!! Great honor to be part of vdL"
)

hashtag_pattern <- "[#][\\w_-]+"

test_a <- str_remove_all(string = sandbox, pattern = hashtag_pattern)
# perl = TRUE so that `\w` inside the bracket class is read as a word-
# character class; the first gsub() attempt ran without it and was repeated
# with this flag.
test_b <- gsub(pattern = hashtag_pattern, replacement = "", x = sandbox, perl = TRUE)

# Check whether each approach keeps the element names and compare the text.
names(test_a)
names(test_b)
test_a
test_b
library(hunspell)
library(textcat) # 1.0-7, n-gram based language detection
library(cld2) # 1.2
library(cld3) # 1.
library(cld2) # 1.2
#also differing lengths of lists make it hard to vectorize the operation
#testing a for loop approach

# Tokens hunspell should accept as correct (acronyms and name fragments).
to_ignore <- c("TTNET", "ECB", "OurWay", "vdL")

# Sandbox tweets with deliberate errors: a merged word ("newmeasures"),
# misspellings ("consultasion", "coulnd't") and a non-English word
# ("seviye"). Only this final variant is kept; earlier versions were
# overwritten before being used any further.
sandbox_misspell <- c(
  tweet1 = "The EU commission has concluded an investigation against Poland due to recent events and found serious breaches of rule-of-law. Poland will be kicked out of the union",
  tweet2 = "Amazing organization. I attended the first TTNET high-seviye dialog conference",
  tweet3 = "ECB announced newmeasures against inflation",
  tweet4 = "After consultasion with relevant stakeholders, we decided to stop vaccination purchases",
  tweet5 = "This day coulnd't get any better!I received my mission letter to ensure our European way of life #EU#OurWay!! Great honor to be part of vdL"
)

# Flat character vector of every flagged word across the tweets.
sandbox_spellcheck <- hunspell(
  text = sandbox_misspell,
  format = "text",
  ignore = to_ignore
) %>%
  unlist()
# Language detection on the flagged words: textcat, cld2, cld3 ----

# textcat with its default profiles, then with the ECIMCI profile set.
lang_det_test_1 <- textcat::textcat(x = sandbox_spellcheck)
lang_det_test_2 <- textcat::textcat(x = sandbox_spellcheck, p = ECIMCI_profiles)

# cld2 on the whole vector at once.
lang_det_test_3 <- cld2::detect_language_mixed(text = sandbox_spellcheck, plain_text = TRUE)
View(lang_det_test_3)
lang_det_test_3
lang_det_test_3$classification

# cld2 on single words.
lang_det_test_3 <- cld2::detect_language_mixed(text = sandbox_spellcheck[2], plain_text = TRUE) #this needs looping over
View(lang_det_test_3)
lang_det_test_3 <- cld2::detect_language_mixed(text = sandbox_spellcheck[3], plain_text = TRUE) #this needs looping over
lang_det_test_3$classification

# cld3 on a single word. Only detect_language_mixed() is used here:
# cld3 has no detect_language_multi(), so those calls were dropped.
lang_det_test_4 <- cld3::detect_language_mixed(text = sandbox_spellcheck[3])
View(lang_det_test_4)
#fastText based on the blog post: https://www.r-bloggers.com/2021/05/language-identification-using-the-fasttext-package-a-benchmark/
# Install fastText only when it is missing, then attach it.
if (!requireNamespace("fastText", quietly = TRUE)) {
  install.packages("fastText")
}
library(fastText)

# Pre-trained language-identification models stored next to the dictionaries.
# (The system.file() lookups tried earlier were never used, and the
# ".7dictionaries" path typo has been corrected.)
small_model <- "./dictionaries/lid.176.ftz"
large_model <- "./dictionaries/lid.176.bin"
model_pat <- small_model # earlier name for the small model path

# Fail early with a clear message if a model file is missing.
stopifnot(file.exists(small_model), file.exists(large_model))

# Top-3 language guesses per flagged word with the small model; the row
# names are kept as a `word_id` column.
lang_det_test_ft <- fastText::language_identification(
  input_obj = sandbox_spellcheck,
  pre_trained_language_model_path = small_model,
  k = 3,
  th = 0.0,
  threads = 1,
  verbose = TRUE
) %>%
  as_tibble(rownames = "word_id")
View(lang_det_test_ft)

# Same call with the large model.
lang_det_test_ft_large <- fastText::language_identification(
  input_obj = sandbox_spellcheck,
  pre_trained_language_model_path = large_model,
  k = 3,
  th = 0,
  threads = 1,
  verbose = TRUE
) %>%
  as_tibble(rownames = "word_id")
View(lang_det_test_ft_large)
# Spell-check and language detection on the real tweet sample ----
packs <- c("tidyverse", "spacyr")
pacman::p_load(char = packs)

# Ids of the sampled tweets, then the full data restricted to that sample.
# The full data file is picked interactively.
data.path <- paste0(getwd(), "/analysis_data/")
data_id <- readRDS(paste0(data.path, "EU_corpus_sample.RDS")) %>% pull(tweet_id)
data <- readRDS(file.choose()) %>% filter(id %in% data_id)

# English tweets only; text and ids are taken from the same filtered rows.
english_tweets <- data %>% filter(lang == "en")
tweets <- pull(english_tweets, text)
tweet_ids <- pull(english_tweets, id)
political_respons_example <- data %>%
  filter(id == "1001044775120916480") %>%
  pull(text)

# Remove URLs (everything from "http" up to the next space or end of text).
tweets <- gsub("http.*?( |$)", "", tweets, perl = TRUE)

# Hashtags and mentions should not be reported as spelling mistakes.
hashtags <- str_extract_all(string = tweets, pattern = "[#][\\w_-]+") %>%
  unlist() %>%
  unique()
mentions <- str_extract_all(string = tweets, pattern = "[@][\\w_-]+") %>%
  unlist() %>%
  unique()
mentions

# The session noted that "some mentions are not ignored".
# NOTE(review): presumably hunspell checks the bare word without its "#"/"@"
# prefix, so the prefix-free forms are added as well -- confirm on the data.
tags <- c(hashtags, mentions)
to_ignore <- unique(c(tags, str_remove(tags, "^[#@]")))
to_ignore

# Run hunspell once: inspect the per-tweet list, then flatten it.
tweets_spellcheck <- hunspell(text = tweets, format = "text", ignore = to_ignore)
View(tweets_spellcheck)
tweets[7]
tweets_spellcheck <- unlist(tweets_spellcheck)

# textcat: default profiles, then the ECIMCI profiles.
rl_text_cat <- data.frame(
  word = tweets_spellcheck,
  language = textcat::textcat(x = tweets_spellcheck)
)
View(rl_text_cat)
rl_text_cat_ext <- data.frame(
  word = tweets_spellcheck,
  language = textcat::textcat(x = tweets_spellcheck, p = ECIMCI_profiles)
)
View(rl_text_cat_ext)

# fastText: attach the words as an extra column (cbind). The first attempt
# used rbind, which appends them as a row instead.
rl_ft_small <- fastText::language_identification(
  input_obj = tweets_spellcheck,
  pre_trained_language_model_path = small_model
) %>%
  cbind(., tweets_spellcheck)
View(rl_ft_small)
rl_ft_large <- fastText::language_identification(
  input_obj = tweets_spellcheck,
  pre_trained_language_model_path = large_model
) %>%
  cbind(., tweets_spellcheck)
View(rl_ft_large)

glimpse(tweets)

# Last step of the session: hashtags and mentions extracted in a single
# pass. This overwrites `hashtags` with the combined set.
hashtags <- str_extract_all(string = tweets, pattern = "[#][\\w_-]+|[@][\\w_-]+") %>%
  unlist() %>%
  unique()
cat("spellchecking the tweets")
View(rl_ft_large)
# spaCy trials on the sandbox tweets ----
spacy_initialize()

# Sentence segmentation: printed, stored as returned, then as a data frame.
spacy_tokenize(x = sandbox, what = "sentence")
a <- spacy_tokenize(x = sandbox, what = "sentence")
View(a)
a <- spacy_tokenize(x = sandbox, what = "sentence", output = "data.frame")
View(a)

# Full parse: POS, tags, lemmas and entities; then with dependencies too.
a <- spacy_parse(
  x = sandbox,
  pos = TRUE,
  tag = TRUE,
  lemma = TRUE,
  entity = TRUE
)
a <- spacy_parse(
  x = sandbox,
  pos = TRUE,
  tag = TRUE,
  lemma = TRUE,
  entity = TRUE,
  dependency = TRUE
)

# Same parse on the single real tweet selected earlier.
a <- spacy_parse(
  x = political_respons_example,
  pos = TRUE,
  tag = TRUE,
  lemma = TRUE,
  entity = TRUE,
  dependency = TRUE
)
a

# Check that the large fastText model file is present.
file.exists("./dictionaries/lid.176.bin")